diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index 07f0db3339ad9053dc95b284c4ae14e014efff89..0000000000000000000000000000000000000000
--- a/.gitattributes
+++ /dev/null
@@ -1,16 +0,0 @@
-*.bin.* filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tar.gz filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/__pycache__/__init__.cpython-38.pyc b/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72bbdd3aa778c8d95b09d14f3d92ee2c8285f2fa
Binary files /dev/null and b/__pycache__/__init__.cpython-38.pyc differ
diff --git a/__pycache__/__init__.cpython-39.pyc b/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1770ecb9d0a9eeb20e893550e7431d9f22e39a8
Binary files /dev/null and b/__pycache__/__init__.cpython-39.pyc differ
diff --git a/__pycache__/conftest.cpython-38-pytest-6.1.2.pyc b/__pycache__/conftest.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db98b1183c62ea6cf876965eeac89e878cef095d
Binary files /dev/null and b/__pycache__/conftest.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc b/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..507b5e85acb303cccd3b7b8a9f7940e38ea83cfe
Binary files /dev/null and b/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc b/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..494de724230dfb9f0d5b29fefd0cbb31b850a218
Binary files /dev/null and b/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc b/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..494de724230dfb9f0d5b29fefd0cbb31b850a218
Binary files /dev/null and b/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc b/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5b184f82f798fabe1d927b91b16be3c78e61735
Binary files /dev/null and b/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_activations.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_activations.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1f3ec4bd23c2dddabe6ab481768864112379463
Binary files /dev/null and b/__pycache__/test_activations.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_activations.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_activations.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de02141b45b385481ce39301b9f19b6f90a755e1
Binary files /dev/null and b/__pycache__/test_activations.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_activations.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_activations.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33a97afa27e9bdc1055e32cbee0ef12fedbcc4a2
Binary files /dev/null and b/__pycache__/test_activations.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_activations_tf.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9d6d78d4f239623e9d6bed056dd1c17bd11faae
Binary files /dev/null and b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_activations_tf.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf00f87b5cd6265c1a0453b8c47511d44cb7eb22
Binary files /dev/null and b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_activations_tf.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf00f87b5cd6265c1a0453b8c47511d44cb7eb22
Binary files /dev/null and b/__pycache__/test_activations_tf.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_benchmark.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_benchmark.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f881526fb0d0265503158a0eb35e36601886ea2
Binary files /dev/null and b/__pycache__/test_benchmark.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_benchmark.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_benchmark.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..526285a7ddaafdfde8a5113e7b55c962f6a477a1
Binary files /dev/null and b/__pycache__/test_benchmark.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_benchmark.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_benchmark.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cd55fce023b374769ed902494d6e11a7ad47917
Binary files /dev/null and b/__pycache__/test_benchmark.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f7660c23d6d7c053a8b62424977490fc18c846b
Binary files /dev/null and b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0579adb7784f9a7b437553700d0f3aa847339f0e
Binary files /dev/null and b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0579adb7784f9a7b437553700d0f3aa847339f0e
Binary files /dev/null and b/__pycache__/test_benchmark_tf.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_cli.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_cli.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a91d3e98e9e03e1539f7f2b29a8a691d12c2066
Binary files /dev/null and b/__pycache__/test_cli.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_cli.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_cli.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90e0368d48d75e3ae5955a1384e252361f7aba6c
Binary files /dev/null and b/__pycache__/test_cli.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_cli.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_cli.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42ebae07ed2a1ad717e3f52902771a4260d8fad0
Binary files /dev/null and b/__pycache__/test_cli.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46ee7c988eabbbde1b97c24dd72ef56cbf9eee5f
Binary files /dev/null and b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9991cf475a87770c6b0522d301f07b687f72775b
Binary files /dev/null and b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9991cf475a87770c6b0522d301f07b687f72775b
Binary files /dev/null and b/__pycache__/test_configuration_auto.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_configuration_common.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_configuration_common.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc26dbfb447ef75810870ab241e327681ed4a89e
Binary files /dev/null and b/__pycache__/test_configuration_common.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_configuration_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..751f3e3f6845682ff1e4cbb002b368cb93d0b169
Binary files /dev/null and b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_configuration_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d672bb69ddb38a34b7e9a15655bf565e7133e130
Binary files /dev/null and b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_configuration_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a487ba874c93da212a5cba5d4f76bd7dbf65dc70
Binary files /dev/null and b/__pycache__/test_configuration_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_configuration_common.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_configuration_common.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c39b4039ab3a23eb71aeca023673cfd3ef4371da
Binary files /dev/null and b/__pycache__/test_configuration_common.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_data_collator.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_data_collator.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0335e638a83e594583fd9449bf3dd1623a28a80
Binary files /dev/null and b/__pycache__/test_data_collator.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_data_collator.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_data_collator.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cc7fd332b4535d1770ca6e1a386a4e433c2153f
Binary files /dev/null and b/__pycache__/test_data_collator.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_data_collator.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_data_collator.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f9ce1cde3aa3773a30261d0cb80288998a553d1
Binary files /dev/null and b/__pycache__/test_data_collator.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_doc_samples.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df430609960d6b7160add0874094b3852ae2e75f
Binary files /dev/null and b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_doc_samples.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2813efc739e4500ea26a486362d9b11b95cd4cd
Binary files /dev/null and b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_doc_samples.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2813efc739e4500ea26a486362d9b11b95cd4cd
Binary files /dev/null and b/__pycache__/test_doc_samples.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d0beb2292bb65f55030a9e4cd917c27b9a630bc
Binary files /dev/null and b/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7db57695ca4fef6922f9ecf7345a1ee2b6c9991f
Binary files /dev/null and b/__pycache__/test_feature_extraction_auto.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb9149fc1fcab9afd9db20d3e7922ddf95c81549
Binary files /dev/null and b/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7233a40fc75f8db8cd0038016824204f1de5948b
Binary files /dev/null and b/__pycache__/test_feature_extraction_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_deit.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_deit.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c14d6f2489764d6b2e7dec3cce87e1756c4323c1
Binary files /dev/null and b/__pycache__/test_feature_extraction_deit.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08daa59971d7c751eaedfe3cd902518739f94227
Binary files /dev/null and b/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83e2270a2c6d6dd33a432903dd3e59e5035bb4ec
Binary files /dev/null and b/__pycache__/test_feature_extraction_detr.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d74154a57994eeb8ce4687ed1c4e6ef5cb8fc254
Binary files /dev/null and b/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26e398e6758533ec725fa8d6487006cb99e60486
Binary files /dev/null and b/__pycache__/test_feature_extraction_speech_to_text.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef057ec39113e8ccd66bd3e2b316973f8a3f156c
Binary files /dev/null and b/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6003fb8d755aeb23e79d7d7398f618ec93f04054
Binary files /dev/null and b/__pycache__/test_feature_extraction_vit.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94ab0a004749caeea77598341fe33b638a4aa818
Binary files /dev/null and b/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94ab0a004749caeea77598341fe33b638a4aa818
Binary files /dev/null and b/__pycache__/test_feature_extraction_wav2vec2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_file_utils.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_file_utils.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7cf9a5bac6ef1ee086252c7896414fa26bed8e40
Binary files /dev/null and b/__pycache__/test_file_utils.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_file_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_file_utils.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f6d49cc159892b7b972230ec9bb3590c5da614
Binary files /dev/null and b/__pycache__/test_file_utils.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_file_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_file_utils.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f6d49cc159892b7b972230ec9bb3590c5da614
Binary files /dev/null and b/__pycache__/test_file_utils.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_flax_auto.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..880e1a305d666a0371df30489b3dc48b08d5f592
Binary files /dev/null and b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_flax_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa13dd6de7ff4b43f00e28676b428ed5833570d9
Binary files /dev/null and b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_flax_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa13dd6de7ff4b43f00e28676b428ed5833570d9
Binary files /dev/null and b/__pycache__/test_flax_auto.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cc55341ef4932753bd26ef16eba0a5f58b647b0
Binary files /dev/null and b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e42d09f5ece0a88ed083105489c759ce1eeb66d1
Binary files /dev/null and b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3289515e9d771a81cf215e1d64d9a95ad9f30535
Binary files /dev/null and b/__pycache__/test_generation_beam_search.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0113202715ef756a28ab3931d3b8a3b7390e6ec2
Binary files /dev/null and b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae5a991586d5ce3fde5f53a416cbf8ecd9bf41f9
Binary files /dev/null and b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d3126b71ce3ed0fa21ef049b47156900d73ed5c
Binary files /dev/null and b/__pycache__/test_generation_logits_process.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0eec360641958ddbc53e43eef60e4ab393f367d
Binary files /dev/null and b/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab31533e205d5f802ae4e3b8c508a24fe0a2d191
Binary files /dev/null and b/__pycache__/test_generation_stopping_criteria.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_generation_utils.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c24649c6d5671d18c2b0e31236638cecaa4353d
Binary files /dev/null and b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_generation_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a0ec3a2ac36d3f8fe1190fe30af41404f550aa2
Binary files /dev/null and b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_generation_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a833301b21f73206d3696168c7a54c8a79d1f609
Binary files /dev/null and b/__pycache__/test_generation_utils.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_generation_utils.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_generation_utils.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c6642034f814a2114a34afa834d30c9b41c7f36
Binary files /dev/null and b/__pycache__/test_generation_utils.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_hf_api.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_hf_api.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40a3f6dbaa9620da6a7ef2eeb13cafd296a258f6
Binary files /dev/null and b/__pycache__/test_hf_api.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_hf_api.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_hf_api.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17ad0f8f97e2518db90b26f12fdbc7194956d005
Binary files /dev/null and b/__pycache__/test_hf_api.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_hf_api.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_hf_api.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3d76b248c6ba629aac337694000121cac731363
Binary files /dev/null and b/__pycache__/test_hf_api.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80533b09c90275c1f511ba8233f1790cced3c1fd
Binary files /dev/null and b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ab538623e1c79f33054d89f526a11356fb389b2
Binary files /dev/null and b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ab538623e1c79f33054d89f526a11356fb389b2
Binary files /dev/null and b/__pycache__/test_hf_argparser.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_image_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_image_utils.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..afc0546f0cd5227b4e692e973c7e8c0cf0e37116
Binary files /dev/null and b/__pycache__/test_image_utils.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_image_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_image_utils.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96749980e92ee8cb33af1343dcb99818b5253627
Binary files /dev/null and b/__pycache__/test_image_utils.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_logging.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_logging.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1730d4014b756d47939dad98a9272b94e3ff9ae4
Binary files /dev/null and b/__pycache__/test_logging.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_logging.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_logging.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fcd07f50888368b59ec9cf971ff588471e75a8e
Binary files /dev/null and b/__pycache__/test_logging.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_logging.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_logging.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66d3c6184887050baeeb9748ffff4d2bfeb8a252
Binary files /dev/null and b/__pycache__/test_logging.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_model_card.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_model_card.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..546575caa75f34df82af9f023463c50c0b5ecdc3
Binary files /dev/null and b/__pycache__/test_model_card.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_model_card.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_model_card.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cfc9f1d21fcbe2bd1f7ccf9bb29c0f9855f68c5
Binary files /dev/null and b/__pycache__/test_model_card.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_model_card.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_model_card.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cfc9f1d21fcbe2bd1f7ccf9bb29c0f9855f68c5
Binary files /dev/null and b/__pycache__/test_model_card.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_model_output.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_model_output.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e86f14dda8bd3642a060d16ed7c03d0595bff77b
Binary files /dev/null and b/__pycache__/test_model_output.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_model_output.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_model_output.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..796285788ac80d281a4c70ecceb537fd4c3842ec
Binary files /dev/null and b/__pycache__/test_model_output.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_model_output.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_model_output.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..796285788ac80d281a4c70ecceb537fd4c3842ec
Binary files /dev/null and b/__pycache__/test_model_output.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61448e32c8522e7c60d755586333093f05ecc52c
Binary files /dev/null and b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c461dc7df7e302cf3042f7ccf1cbaef4254f8a6
Binary files /dev/null and b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b4e8684cecb278d9d45ac0b3cc68f9d335b51f6
Binary files /dev/null and b/__pycache__/test_modeling_albert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36b89cd10ad60bdc3920f58130b2e23b61febbca
Binary files /dev/null and b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38a6a705e6b6096c0c48fb81b3e1fbd387bd8e82
Binary files /dev/null and b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..027d8cf330c9402adb9475161b67c6b75e16439d
Binary files /dev/null and b/__pycache__/test_modeling_auto.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c965929b857cb7478d70324829fe4ddd2c5cc09
Binary files /dev/null and b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a9e744805b08ad436aaf14f46cdc19e0643a4f2
Binary files /dev/null and b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a13dd8b80cfc63fb2200a8bf69437fd322f6c2ef
Binary files /dev/null and b/__pycache__/test_modeling_bart.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b304445756e827a3142b340dbc20807f51938fa
Binary files /dev/null and b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7aed47a7ee8eb8aec1b4411bcf5cb6b873b00985
Binary files /dev/null and b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcfc34852be58ec89ed95fd4438e85b479530bf3
Binary files /dev/null and b/__pycache__/test_modeling_bert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5b84c496ca3b3b266b1d18526747453caa78549
Binary files /dev/null and b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f45c519ebc35716f817fc6a7c17c566e83a15783
Binary files /dev/null and b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bddc550b712b66d11aefb6cc3b28d3721a9f4703
Binary files /dev/null and b/__pycache__/test_modeling_bert_generation.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c83fd2d6b341f58a2fe34b10d54f2e7a49cfed2
Binary files /dev/null and b/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b67ca13c8fb2fbbbab0f19dcc6a51bb1ac138d7
Binary files /dev/null and b/__pycache__/test_modeling_big_bird.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..738b26d7b944f2095da24bc833ae00f71253111d
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ef47dbaa2cb61a3001876ab14c86d2ef2a36310
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3377c77d4c4848850705eabf2918dd710e6c039c
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b97bf6f6640f3b41ebc17ea8eeb11a9228ade92
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1212de9fd4afd8f7135bb7960beb09cfc7571233
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a38a3c1e6b7ae2471f350df3e97c87f7b14e96c3
Binary files /dev/null and b/__pycache__/test_modeling_blenderbot_small.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..709f218b99660790a713c3825d834722a5cc1dd7
Binary files /dev/null and b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd70fde711ecda2d79c4ae521fcad1ed834307cc
Binary files /dev/null and b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd70fde711ecda2d79c4ae521fcad1ed834307cc
Binary files /dev/null and b/__pycache__/test_modeling_bort.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f69eb282e41e561ecb3e287b9d80e694cd983177
Binary files /dev/null and b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a502212b5fe9ea10825bb5154fcb5dff873c320
Binary files /dev/null and b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a502212b5fe9ea10825bb5154fcb5dff873c320
Binary files /dev/null and b/__pycache__/test_modeling_camembert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_common.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_common.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1ce2bf0e54833b9c9e768d5f14576c473caef1c
Binary files /dev/null and b/__pycache__/test_modeling_common.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_modeling_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb8f671a03e9f8da220ca0b86cc074e443487b9e
Binary files /dev/null and b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b0fd66c4343939900e95de13ea4360c081a4d
Binary files /dev/null and b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fe9db3968cc9860554f70138a4f300d5b30968f
Binary files /dev/null and b/__pycache__/test_modeling_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_common.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_modeling_common.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdc86275831663e2553ca4b6efdc6ce27d274f03
Binary files /dev/null and b/__pycache__/test_modeling_common.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f12af286b1baadf7bbeda502a7a70e210eb12050
Binary files /dev/null and b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d2784cc8d91a1aed6fab103aae58a43c6acbbfb
Binary files /dev/null and b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0874628fde96461d734744993720ac54ceb8759
Binary files /dev/null and b/__pycache__/test_modeling_convbert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53ab28b619f5e6292668e1c5b860350af74aee13
Binary files /dev/null and b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3ff24bd22f526a9f692cb2bc4a24af9da9f14c
Binary files /dev/null and b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3ff24bd22f526a9f692cb2bc4a24af9da9f14c
Binary files /dev/null and b/__pycache__/test_modeling_ctrl.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c1ce1bc9c9dad12593621b7599e92de4451e3f1
Binary files /dev/null and b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..665f4b82134f8e66c9666f971c2614c57d2ef5c5
Binary files /dev/null and b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..665f4b82134f8e66c9666f971c2614c57d2ef5c5
Binary files /dev/null and b/__pycache__/test_modeling_deberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8578fdefd72a32bcba75d076840a80bb0791639
Binary files /dev/null and b/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8578fdefd72a32bcba75d076840a80bb0791639
Binary files /dev/null and b/__pycache__/test_modeling_deberta_v2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65b7faf13999a4c9b4a768ca03f1af07bdcf7453
Binary files /dev/null and b/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bbeaf04ba4af87aa21017203927935e74d92820
Binary files /dev/null and b/__pycache__/test_modeling_deit.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49d0e8d1ae8a4f0a7828568e91ab2733732e2bfc
Binary files /dev/null and b/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c52e0f7e9558b74a2282bffaf0ce16bdeaf4dcd
Binary files /dev/null and b/__pycache__/test_modeling_detr.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e33e8b8432c1b7d9b2f7e3b24de3f56e0e40e86d
Binary files /dev/null and b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd5c6e937b259b0943dbcea7aedd1599c0ec9b7f
Binary files /dev/null and b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7a12fa868dc899ec5b3b9dc44c789b3b4c4d8f5
Binary files /dev/null and b/__pycache__/test_modeling_distilbert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0da7b0afea9f733beae5d89420e0cfa3c54910f2
Binary files /dev/null and b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66475a0b2dcd2b55c14827c95dcbeb1630907fbb
Binary files /dev/null and b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66475a0b2dcd2b55c14827c95dcbeb1630907fbb
Binary files /dev/null and b/__pycache__/test_modeling_dpr.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9be44cb4e4810866ffab0f803a526fd40a81f9d
Binary files /dev/null and b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52b034794bc5cedceb8c8c56685b420e53cf66f3
Binary files /dev/null and b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c906d86fb0fea3eb91ba84024147867cab907ef2
Binary files /dev/null and b/__pycache__/test_modeling_electra.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e63b473b77276c3617515b24fef80afbba7fee6
Binary files /dev/null and b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6027140057e84bdbd200ba0f4fde4bd36ceff18
Binary files /dev/null and b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53f3d614ca2c6bf1a8a36a318a5941ddf5ee4679
Binary files /dev/null and b/__pycache__/test_modeling_encoder_decoder.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03eb2f662a73931d08fc3a0905aea7a2f5e131be
Binary files /dev/null and b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff1743cc2bba9097bd4c89f46c1f61bcc5cefd6b
Binary files /dev/null and b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff1743cc2bba9097bd4c89f46c1f61bcc5cefd6b
Binary files /dev/null and b/__pycache__/test_modeling_flaubert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d63169f148c95a8b83c7f3842b0add34197d7a38
Binary files /dev/null and b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5abe0cbbdd7394918e26a52bcb6d22c1a8346e33
Binary files /dev/null and b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f9f28d86642085af03cfb965950272b2d59a70e
Binary files /dev/null and b/__pycache__/test_modeling_flax_bert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e69baf6e909a149aedcd77de1168e5b3088d788f
Binary files /dev/null and b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0c42b64e16d7e268f4ef60c3bdd8a0287ca8962
Binary files /dev/null and b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc307767263f581428d4194b824c23ccdbb6a373
Binary files /dev/null and b/__pycache__/test_modeling_flax_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_flax_electra.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_flax_electra.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd4f9d2402ffa09d2ec1c7186833c02cac133699
Binary files /dev/null and b/__pycache__/test_modeling_flax_electra.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00e0b086d16c769e7ad75053ffa545c602f9b9c0
Binary files /dev/null and b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..206a3e42a3b07a396665d71619b623d1759a9367
Binary files /dev/null and b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9b8792db36fa1bbea77b4b381491b6e3027ac1b
Binary files /dev/null and b/__pycache__/test_modeling_flax_roberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..355134e3fb46ccf995de078b8a1d0d040d4c7143
Binary files /dev/null and b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a22f45f66cace08f3f77ec872f6fbc70f3af7ee
Binary files /dev/null and b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aec7f602fbb777c570b2f45e1c2d1e6945e85f69
Binary files /dev/null and b/__pycache__/test_modeling_fsmt.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fb8bcf8205d19cc91f1729dfe174f3c0c5c9dc9
Binary files /dev/null and b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf1659fb6a96c07f2f4d784c6e209a62959f016f
Binary files /dev/null and b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da17de024c060e3bccc9a24977e1d9a04ea545e8
Binary files /dev/null and b/__pycache__/test_modeling_funnel.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c3a76e8dd504bfd2cbd74c6374902e546ece6be
Binary files /dev/null and b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81439cc1437884d330e7444ba5cb26d2473ceed
Binary files /dev/null and b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d2de083518c0b5f2f96b6416d8a4ac8a6345cd5
Binary files /dev/null and b/__pycache__/test_modeling_gpt2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb09dca130ad5d3ad1766277aa9b9c299b05786a
Binary files /dev/null and b/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ec02370152a0fa935d8cfb5b1f78976e56a36d5
Binary files /dev/null and b/__pycache__/test_modeling_gpt_neo.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9118b4bd860c33c686bac2d535cd6b7e3cc4f0af
Binary files /dev/null and b/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..459bb82c79469983a7cc7c3b78ebaa51e7690c0b
Binary files /dev/null and b/__pycache__/test_modeling_ibert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96cc362176087f344d0252b7d0809d0555098db6
Binary files /dev/null and b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f36872cf76c4db1dfa1a1bea75f9e86a4cdee81
Binary files /dev/null and b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6e67107464ea8d8f315600c64f7a2bca1fef3c2
Binary files /dev/null and b/__pycache__/test_modeling_layoutlm.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_led.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..120262acc63dad0a5894ac13147a5688368736bc
Binary files /dev/null and b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_led.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b30d7217a60ffd296671c52441987db58ece15fa
Binary files /dev/null and b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_led.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d0767c736bdd0ce841419b654bd687aeb623ba5
Binary files /dev/null and b/__pycache__/test_modeling_led.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05986dcd91fd08490bfb94a5f00a2f1da03f3749
Binary files /dev/null and b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c30229e0d99722f5061f36fafbc49b5179535528
Binary files /dev/null and b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..318a5889ad1b670641644f1b9b9c8aa7f73523a3
Binary files /dev/null and b/__pycache__/test_modeling_longformer.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_luke.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_luke.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47b6c67ad6cecd9f26cf18dc51040295d6114599
Binary files /dev/null and b/__pycache__/test_modeling_luke.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1f2f2f2ee553ff3f44e81415515fde47ad02071
Binary files /dev/null and b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa8a9208df6c49572db1ba34eb58e4426aeec2c4
Binary files /dev/null and b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fd813e7f57481bf9f18ef7bfc7af7217ad8669a
Binary files /dev/null and b/__pycache__/test_modeling_lxmert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31404611230c10c217341c76c221beb8b6f52b63
Binary files /dev/null and b/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1183e9d62762eabf763747236f6b07f1c1fd6186
Binary files /dev/null and b/__pycache__/test_modeling_m2m_100.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a551abcac955ae49b2a436c56bbf43f1e07c7805
Binary files /dev/null and b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d1f23d3868af715a5c2b21e695f198bf9a24dd4
Binary files /dev/null and b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19f7aa3534b9d3caf5a0e15de153222d64082436
Binary files /dev/null and b/__pycache__/test_modeling_marian.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94653e4d3f190cab9f82bba48a7c6ffe94b5d653
Binary files /dev/null and b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17cb8165fc7547ab3d4a975cf90b1c3e81d1742a
Binary files /dev/null and b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index
0000000000000000000000000000000000000000..421969a590fe17cb34956f3d2ff3bbde636fe95c Binary files /dev/null and b/__pycache__/test_modeling_mbart.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..558dc9e5b5b3ff8c72c46ec4aa8c520d74e191bb Binary files /dev/null and b/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c3665340f90bd85c2c9f315ac77580eaec1f154 Binary files /dev/null and b/__pycache__/test_modeling_megatron_bert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f513ab453e0dbd56c5a90fd230ab2a9fecee65fb Binary files /dev/null and b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28256211748dd39df5464a58584b7b2e1125bab5 Binary files /dev/null and b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de5f8ecc4295c4993cdc09fdff8ff3eafb1c541a Binary files /dev/null and b/__pycache__/test_modeling_mobilebert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2ac28e496b832dc972399b3b9bf04d93be1d966 Binary files /dev/null and b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10dd4d283573c905e7217a43832babd77be1782e Binary files /dev/null and b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10dd4d283573c905e7217a43832babd77be1782e Binary files /dev/null and b/__pycache__/test_modeling_mpnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5d6343918a05e17ccb7723922c303391b1c9df9 Binary files /dev/null and b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e41e470aaa66532b41ac0f88d04aa47b2d34ac18 Binary files /dev/null and b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e41e470aaa66532b41ac0f88d04aa47b2d34ac18 Binary files /dev/null and b/__pycache__/test_modeling_mt5.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..867fb8aa802734c12ff65de7c1cfad24c7a577f5 Binary files /dev/null and b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1cb9ea2c1b09750f1d6d95f8327d9f169a88f11 Binary files /dev/null and b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1cb9ea2c1b09750f1d6d95f8327d9f169a88f11 Binary files /dev/null and b/__pycache__/test_modeling_openai.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc84e0c26efa3216c52d67352b9800e2fbe12c48 Binary files /dev/null and b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..783aaec0ec6bdeb18bc76c470b52751593d79ca9 Binary files /dev/null and b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6280f6b3ae61780662354228a8513fcd581d6c0a Binary files /dev/null and b/__pycache__/test_modeling_pegasus.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..142b44decba8f7e75b0975fdba7fbc8b6729daa6 Binary files /dev/null and b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df116274842cad3ae787bf02bf8b786d62ffc79e Binary files /dev/null and b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..929adf137bfa8d48f059059a14e672b531b44fa8 
Binary files /dev/null and b/__pycache__/test_modeling_prophetnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ffdbf9d6c82cbe26aab5844d88380adaf381af3 Binary files /dev/null and b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0d1048421f4c5230e6bb75d9e9e60aa9f1c8031 Binary files /dev/null and b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fc418a19a1a1fcde3d660d754a6dbd4c8080a72 Binary files /dev/null and b/__pycache__/test_modeling_rag.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e4a16a9c52dd3cf178538da48f058c43d93e425 Binary files /dev/null and b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d59ce80d751026277ba06126d3f95d9b02b9a10 Binary files /dev/null and b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34ab5be40cf29d719fb6d35a10eb33f7153d0f79 Binary files /dev/null and b/__pycache__/test_modeling_reformer.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_rembert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_rembert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd154d3e65895f29296e0975bd25647a4629b907 Binary files /dev/null and b/__pycache__/test_modeling_rembert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b99fab52f5464fdb05441bf3f8f3ec85db13f915 Binary files /dev/null and b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62cba19decedffea5ebcb0545c5040f3de7d568c Binary files /dev/null and b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d71502c6692843c057e8efb19f8f7a3d5c1cff7 Binary files /dev/null and b/__pycache__/test_modeling_roberta.cpython-38-pytest-6.2.4.pyc differ diff --git 
a/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80549273f3a303a729896d0dfad582c95181ce74 Binary files /dev/null and b/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3e76afc664f01b3e669553d73995167dceae864 Binary files /dev/null and b/__pycache__/test_modeling_speech_to_text.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92e5e9933357a7ecdc6d97fc34a4a1b3e0770b28 Binary files /dev/null and b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a25a94cfb6bb24207e2a58d34333324fa84a319 Binary files /dev/null and b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f1bcf54163ed38498999f3c25b8ec2eeb1e7b53 Binary files /dev/null and b/__pycache__/test_modeling_squeezebert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f213ad9e60206c19f4d4d93a3f92bf8f58de10f Binary files /dev/null and b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d3f8e3208e9a68f1125000a658e071935a59ccd Binary files /dev/null and b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c72c936a325dac0c71bad4a9b3ea48d0f434246 Binary files /dev/null and b/__pycache__/test_modeling_t5.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0314813fdefa4105028103a00285883f524a9ee Binary files /dev/null and b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cea6d2975125bf9f8b9e0bc13f8e225e8e23e10 Binary files /dev/null and b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.4.pyc 
b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc87f05b6d428e531ff01e9ebc55cb77210b8b01 Binary files /dev/null and b/__pycache__/test_modeling_tapas.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f52e9d08096fa4771a69ecee989ff38e20c2445b Binary files /dev/null and b/__pycache__/test_modeling_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc differ diff --git a/__pycache__/test_modeling_template_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_template_encoder_bert.cpython-38-pytest-6.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77b974b591cbe53046d065c7d461947c6bb26ec2 Binary files /dev/null and b/__pycache__/test_modeling_template_encoder_bert.cpython-38-pytest-6.1.2.pyc differ diff --git a/__pycache__/test_modeling_template_pt_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_template_pt_encoder_bert.cpython-38-pytest-6.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ca67481c973d49f3b44704757a661af8d3b396f Binary files /dev/null and b/__pycache__/test_modeling_template_pt_encoder_bert.cpython-38-pytest-6.1.2.pyc differ diff --git a/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35000306e86a3e649e1869ee6435d3cf376d502f Binary files /dev/null and b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d7d5184e85d7fb77c68ff1d088234d05e0aac1d Binary files /dev/null and b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f2890386e454fa4c0898b44fab1457b4bd17d9b Binary files /dev/null and b/__pycache__/test_modeling_tf_albert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9676c3e7f267842439ad0c68b3093e1123664f58 Binary files /dev/null and b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3153a7d04d9b3643d74551c8bbc30b174d9225d8 Binary files /dev/null and b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e90b1736f3c24a03c65b97f5c8048b42b8f997a0 Binary files /dev/null and b/__pycache__/test_modeling_tf_auto.cpython-38-pytest-6.2.4.pyc differ 
diff --git a/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee30fa6ec785653a1215f4379c9b18a4db10602f Binary files /dev/null and b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df492fb182086b05f1be55e36b6edb30f94560e4 Binary files /dev/null and b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84c9a39508f5b084f83233eb79ccc2983e1fb708 Binary files /dev/null and b/__pycache__/test_modeling_tf_bart.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1ed487b3dcac169c72e2c9989e7112ac986e276 Binary files /dev/null and b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6e05e137ae5198936f6f90acb2be34d56d0d72d Binary files /dev/null and b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83db97f0ad2906bc8246ace91408190fbcec07cf Binary files /dev/null and b/__pycache__/test_modeling_tf_bert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3248148329c732d4494057a3d2b500e1b3b9adac Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48fba2cf201e55fda47916bb5be4b4ee15bf45fd Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3faac110855d2246e57e9c33fc2947996e132425 Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51fc49a10ab8b29e5acfd77d2d760a92c51329a9 Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.0.pyc 
differ diff --git a/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9658f0fe86cf35676a27bff57f332179dd018d1f Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d139ac1962dcc3554fc423a51cceb59aa826f113 Binary files /dev/null and b/__pycache__/test_modeling_tf_blenderbot_small.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86aed6891f0810b7449955ced1a4e5e3c37495a6 Binary files /dev/null and b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e5da97a3896a1d70247a050c85ffcaf2080006c Binary files /dev/null and b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e5da97a3896a1d70247a050c85ffcaf2080006c Binary files /dev/null and b/__pycache__/test_modeling_tf_bort.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f9c1d2b053148275512e8bd6061d7c1cf4a2333 Binary files /dev/null and b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68b12f5d7af8540ac8b3674078a6ad2efa6ab6eb Binary files /dev/null and b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68b12f5d7af8540ac8b3674078a6ad2efa6ab6eb Binary files /dev/null and b/__pycache__/test_modeling_tf_camembert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cea27b0f2ae3ea3bff2cf905137befea96cd875 Binary files /dev/null and b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.1.2.pyc differ diff --git a/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..509cc4b1d454cf4e64451c11a6769b4f2b2c4377 Binary files /dev/null and 
b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c510f34f3b97052286c2231be3c7601a4ad925f Binary files /dev/null and b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ad370065879c99414f7a7898006e3a9ea4d98ec Binary files /dev/null and b/__pycache__/test_modeling_tf_common.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99dbee53432b2943b7e10b0d51dc3151c78d9f98 Binary files /dev/null and b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b5aa5e0805b113ef30108c285c452a7c087a0c Binary files /dev/null and b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b5aa5e0805b113ef30108c285c452a7c087a0c Binary files /dev/null and b/__pycache__/test_modeling_tf_convbert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec599882a038c6dea982cacba5c2722e0de60b92 Binary files /dev/null and b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08f82a0e1e5be24655f40ff7c59ca278eeac6087 Binary files /dev/null and b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08f82a0e1e5be24655f40ff7c59ca278eeac6087 Binary files /dev/null and b/__pycache__/test_modeling_tf_ctrl.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6983cc25ba45c857bc250510d2df04a9cd35ced4 Binary files /dev/null and b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d956ef3bf1bd63c4041ac2344e4d22232ca31a35 Binary files /dev/null and 
b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d956ef3bf1bd63c4041ac2344e4d22232ca31a35 Binary files /dev/null and b/__pycache__/test_modeling_tf_distilbert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6de037df5512964e55511068c215d0629761fda3 Binary files /dev/null and b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..935cabafaa41dc2df679589ebd46e49c853ab9fd Binary files /dev/null and b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..935cabafaa41dc2df679589ebd46e49c853ab9fd Binary files /dev/null and b/__pycache__/test_modeling_tf_dpr.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..291f81a02e315d1983f65b40006409d0ab5c45ec Binary files /dev/null and b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54da65f9cb222f0be8ac53a348c7edd600ee7e53 Binary files /dev/null and b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54da65f9cb222f0be8ac53a348c7edd600ee7e53 Binary files /dev/null and b/__pycache__/test_modeling_tf_electra.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcc8001e5de993f0aca727460be08190eecde721 Binary files /dev/null and b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..676ce45e2a581750440dbb7d1ec4d0cc8a2100c2 Binary files /dev/null and b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..676ce45e2a581750440dbb7d1ec4d0cc8a2100c2 Binary files /dev/null and 
b/__pycache__/test_modeling_tf_flaubert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dd2f856fa63582fc4f3456feec63cedff36358d Binary files /dev/null and b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b11c683a342cf4238b73c9a20af28ad628e1ea86 Binary files /dev/null and b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43cb0f57248ec6040f8927c220752fee0cc65be7 Binary files /dev/null and b/__pycache__/test_modeling_tf_funnel.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18d9993331278369a90ca631e20e220eed2bb2b7 Binary files /dev/null and b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..269f4a158570c807b3d774556af83c55f36850dc Binary files /dev/null and b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..269f4a158570c807b3d774556af83c55f36850dc Binary files /dev/null and b/__pycache__/test_modeling_tf_gpt2.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..048fad1b7069879e4166cc8be75ee1d4eb60c7fd Binary files /dev/null and b/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9055f2c15434e173ce811710838608ae6ac22194 Binary files /dev/null and b/__pycache__/test_modeling_tf_layoutlm.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa3ac165a85c593a5cb03641a11cc0e5b3ff8c79 Binary files /dev/null and b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1961ada4e836f486442dd189a88df6b35f148721 Binary files /dev/null and 
b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..089d595cbffa427af57a394c3c2dce458528ef7a Binary files /dev/null and b/__pycache__/test_modeling_tf_led.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cab496adea52dfe88a7da293b5aae8d668e282f7 Binary files /dev/null and b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00505f59a8aa1d633839e4b56747005a48104cba Binary files /dev/null and b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e474a41442092bac4a91e593abd221d0ed069fb Binary files /dev/null and b/__pycache__/test_modeling_tf_longformer.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55682ceebf68af899d7d2f94867e70ef6f1ec872 Binary files /dev/null and b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65c7ac944d921c941c647408b9c7e4b45efda596 Binary files /dev/null and b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65c7ac944d921c941c647408b9c7e4b45efda596 Binary files /dev/null and b/__pycache__/test_modeling_tf_lxmert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e52d27929a0a7f370406f9e50ab5d4c13a878d74 Binary files /dev/null and b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75eaf112bf3ec10282f876a10ca70f856041daa1 Binary files /dev/null and b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e25ebd418ffb80f6dc162d55aaba3a850f91c5d Binary files /dev/null and 
b/__pycache__/test_modeling_tf_marian.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..844951ac8a059665dc2462ed3543dee54e788a85 Binary files /dev/null and b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e38e68cb660678b7d5ea0790b482d4a50abcd711 Binary files /dev/null and b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49497baf00c93d86db49bd6a4abd37deb8bf5103 Binary files /dev/null and b/__pycache__/test_modeling_tf_mbart.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8874b1c4ea73d0edb450fbdd27b2b621d6be98df Binary files /dev/null and b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f804066b55ba2cce024bddd28dac019157fbadb Binary files /dev/null and b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f804066b55ba2cce024bddd28dac019157fbadb Binary files /dev/null and b/__pycache__/test_modeling_tf_mobilebert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6a158d2360244492998e289e16c156a96b20eb4 Binary files /dev/null and b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7423cdf481f881ecd32f7f9df0f25f08923025ec Binary files /dev/null and b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7423cdf481f881ecd32f7f9df0f25f08923025ec Binary files /dev/null and b/__pycache__/test_modeling_tf_mpnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c95258eb9410a0c5891ee256c3167c1ca2817279 Binary files /dev/null and 
b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a56a5f0ebf7264a93aa04f78af7fd1819192bae6 Binary files /dev/null and b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a56a5f0ebf7264a93aa04f78af7fd1819192bae6 Binary files /dev/null and b/__pycache__/test_modeling_tf_mt5.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebde864a6a4ccc6019bc8e6a6fdf74670b51e4f0 Binary files /dev/null and b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df5b0ec10aadc8bd8376e38d5e342e01ec548abc Binary files /dev/null and b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df5b0ec10aadc8bd8376e38d5e342e01ec548abc Binary files /dev/null and b/__pycache__/test_modeling_tf_openai.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fdb76931bade0060f7fcb0a50eef9fd761bb7ae Binary files /dev/null and b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45150c61d53b6fc767db282bdf677efad06c0a65 Binary files /dev/null and b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07729c4b1007b9dc15acde902b004992bd11ce49 Binary files /dev/null and b/__pycache__/test_modeling_tf_pegasus.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8da980af432ba7cfc87326cc4f59c04998732047 Binary files /dev/null and b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..264eb0dbde62e6e7fc11e3890aa451562a3444fd Binary files /dev/null and 
b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..264eb0dbde62e6e7fc11e3890aa451562a3444fd
Binary files /dev/null and b/__pycache__/test_modeling_tf_pytorch.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcbc1f94b94e7363bc99fd8815159466fedb76b3
Binary files /dev/null and b/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..732e8b5fa1724901613252baf95bb43c48682beb
Binary files /dev/null and b/__pycache__/test_modeling_tf_rag.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_rembert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_rembert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c795cff529447bb4ca7a5f6014032307434f05c1
Binary files /dev/null and b/__pycache__/test_modeling_tf_rembert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4fdf22636f8cc523254fe9eecb73832b16f0513
Binary files /dev/null and b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..418371418b1bd3930526f4db29fad8ad3233e9d0
Binary files /dev/null and b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..418371418b1bd3930526f4db29fad8ad3233e9d0
Binary files /dev/null and b/__pycache__/test_modeling_tf_roberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88eb4ddebfc44d631db4dcdbb072f4de29d6c339
Binary files /dev/null and b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4465f22425ae3e68fcdb93866d38dd97f646de70
Binary files /dev/null and b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f0b50154fc2e6c22ceb09484e27a29d52a1bd0c
Binary files /dev/null and b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab188a27e7d545829410732ca6c02052752b8e3a
Binary files /dev/null and b/__pycache__/test_modeling_tf_t5.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_tf_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bce95dfdd52e8d80ae318deeb410004d2ddefda2
Binary files /dev/null and b/__pycache__/test_modeling_tf_template_bi_encoder_bert.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_template_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_tf_template_encoder_bert.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d06bd25462f6385c5269d70f07d3fdfd811c3132
Binary files /dev/null and b/__pycache__/test_modeling_tf_template_encoder_bert.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_template_tf_encoder_bert.cpython-38-pytest-6.1.2.pyc b/__pycache__/test_modeling_tf_template_tf_encoder_bert.cpython-38-pytest-6.1.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7846b64204473848cde64e2a9714c4e17ca7ede5
Binary files /dev/null and b/__pycache__/test_modeling_tf_template_tf_encoder_bert.cpython-38-pytest-6.1.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56a0e29cae916e1669a1a1586358710533154c3d
Binary files /dev/null and b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43510c3f4de68a702bad09ee6fd8858fff775a19
Binary files /dev/null and b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43510c3f4de68a702bad09ee6fd8858fff775a19
Binary files /dev/null and b/__pycache__/test_modeling_tf_transfo_xl.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bd4a25679f9502c51e3b0d3e27faad26125bafe
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0d1d8d6127d836ea92b301cfac579998327186d
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0d1d8d6127d836ea92b301cfac579998327186d
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc8931c8bd869cd767caa189980cbd7d5fecf5fd
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d47ef4169a92f821f638c1630218e3ea6b29d8c
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d47ef4169a92f821f638c1630218e3ea6b29d8c
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlm_roberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1204cfdd4e039feb5e77e136c388b5a1cedd3fc
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ebab263e1bab90d7ad9c96c123dc23505a0c206
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ebab263e1bab90d7ad9c96c123dc23505a0c206
Binary files /dev/null and b/__pycache__/test_modeling_tf_xlnet.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cbe3cf38b63c2045b4d426bc3bf5b4705042f4c
Binary files /dev/null and b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34003ffc63b2bd62b6fd2423194cd3b87854dcc7
Binary files /dev/null and b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34003ffc63b2bd62b6fd2423194cd3b87854dcc7
Binary files /dev/null and b/__pycache__/test_modeling_transfo_xl.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed14def9ee1d04e0ec0c4ce225c7c4af036e1723
Binary files /dev/null and b/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a89d2f20d65702b096875cca73c940bd3ceaffba
Binary files /dev/null and b/__pycache__/test_modeling_vit.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b3f1feff2f65014d125e815d6b03b59d91db38f
Binary files /dev/null and b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6403cfb69c76997b7e29e0fa7d8cac4af501fdb4
Binary files /dev/null and b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2344ff0fe53cc54fe445c02545058960ca13f2e
Binary files /dev/null and b/__pycache__/test_modeling_wav2vec2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b24a4e2487aeb9b8714a4543b1f0e90111b2a7a
Binary files /dev/null and b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d071464ef22eb9148fb9ce00a30f4dfa64d9ff44
Binary files /dev/null and b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b8a4f818967da673b6cfb0e15ba5f78279ded93
Binary files /dev/null and b/__pycache__/test_modeling_xlm.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57364606afa84fa9ebca3d695a727073e7307863
Binary files /dev/null and b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e6dee9ae82ae3bd81db2c9a1ca86911234df0e1
Binary files /dev/null and b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e6dee9ae82ae3bd81db2c9a1ca86911234df0e1
Binary files /dev/null and b/__pycache__/test_modeling_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57479d016e640f1cea64d312e4ef08b25ea1cb45
Binary files /dev/null and b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b869ec439a0a8a6ffcd8a24635c5016641b49
Binary files /dev/null and b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f8b869ec439a0a8a6ffcd8a24635c5016641b49
Binary files /dev/null and b/__pycache__/test_modeling_xlm_roberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aed3d84b4ab41741fec409f37e076060c211e3ed
Binary files /dev/null and b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c5657aeedb137e8cfb118637f78f6d6597fb7a8
Binary files /dev/null and b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce59678b8669476b96967a83dc8379551777850e
Binary files /dev/null and b/__pycache__/test_modeling_xlnet.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_modeling_xlnet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_modeling_xlnet.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7afb1001a6642c1bfbf82546a5df63cb78a1419
Binary files /dev/null and b/__pycache__/test_modeling_xlnet.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_offline.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_offline.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05338be6716fd8eac6c083eb2227287e8ec5f7d4
Binary files /dev/null and b/__pycache__/test_offline.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_offline.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_offline.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05338be6716fd8eac6c083eb2227287e8ec5f7d4
Binary files /dev/null and b/__pycache__/test_offline.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_onnx.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_onnx.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8493c682a31c8ca6f8bb6bcc5eab5897f75ff39
Binary files /dev/null and b/__pycache__/test_onnx.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_onnx.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_onnx.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ef9faccd2d39f50c3965f55f7740c219212b799
Binary files /dev/null and b/__pycache__/test_onnx.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_onnx.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_onnx.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92118af5853156b7264a647b6d662f539c715088
Binary files /dev/null and b/__pycache__/test_onnx.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_optimization.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_optimization.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36c1b824de9dfebb25c367ca5186cb406f252781
Binary files /dev/null and b/__pycache__/test_optimization.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_optimization.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_optimization.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05670122704a3636f15f0dc519204c692268c35c
Binary files /dev/null and b/__pycache__/test_optimization.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_optimization.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_optimization.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fbe78223ee5b15eeb4c742bf23d89434d8cf019
Binary files /dev/null and b/__pycache__/test_optimization.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c15fd15f04186211cd6b7cad20722e977f391eb8
Binary files /dev/null and b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..098bfa6d9d87242a7b542d3df9de8baae6e3c251
Binary files /dev/null and b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7702424d298151ab6eb6bf4d894bb78a12f7900e
Binary files /dev/null and b/__pycache__/test_optimization_tf.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48a0fbe2f36f004d44c94abe345e5732f141d079
Binary files /dev/null and b/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48a0fbe2f36f004d44c94abe345e5732f141d079
Binary files /dev/null and b/__pycache__/test_pipelines_automatic_speech_recognition.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c15f52507d273928905f10656ef5f7cb4109eaae
Binary files /dev/null and b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9a81435057c39593ce94910fa3839951b148380
Binary files /dev/null and b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d097cabfc526b236ea07fde99d024f74cf3a6ab0
Binary files /dev/null and b/__pycache__/test_pipelines_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcc6e25f266bb5f04bb52c5f6ac091f17c4c9f05
Binary files /dev/null and b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc4b40435af850f53299819e3697cbb87d4e6907
Binary files /dev/null and b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e266e1320647163cd40be894064b0ed4ded38c91
Binary files /dev/null and b/__pycache__/test_pipelines_conversational.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13f3bfd41fcc8164e66fbaf1c69226785bac32c5
Binary files /dev/null and b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e85cb671596091fec8be9f18340f90736d8ac68
Binary files /dev/null and b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e85cb671596091fec8be9f18340f90736d8ac68
Binary files /dev/null and b/__pycache__/test_pipelines_feature_extraction.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a192203a057e39278b598c3b0fc1c238de18d876
Binary files /dev/null and b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61e81574c115d89004ce6ce7b21651b1825ffe4f
Binary files /dev/null and b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61e81574c115d89004ce6ce7b21651b1825ffe4f
Binary files /dev/null and b/__pycache__/test_pipelines_fill_mask.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e87837c9a72e0698ec469d4e24cbc7d61bcb7acb
Binary files /dev/null and b/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e2dbf3ff4103b0c2d4fed6bc0ccb1955b259065
Binary files /dev/null and b/__pycache__/test_pipelines_image_classification.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f00178e49ea43d24e68a25a1ec34b6ba741fcca4
Binary files /dev/null and b/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7726a345712e17fec4d3a9d9c4f8f6239bc19b80
Binary files /dev/null and b/__pycache__/test_pipelines_ner.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45df7f9950c83220cefbb2659efef5938a688aac
Binary files /dev/null and b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e052b37a5a153b6822fba2f5f9ac3c6b8ab5140e
Binary files /dev/null and b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93601d78be0dc8e6473415d49bbc4a079325a7b4
Binary files /dev/null and b/__pycache__/test_pipelines_question_answering.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e82eed2a603d480acec47fcc5e5fc6e72659eb0
Binary files /dev/null and b/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46c9f9c6e782738b78906ad9cc8eb45cc2dfd3bc
Binary files /dev/null and b/__pycache__/test_pipelines_sentiment_analysis.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8cdc9135b504f29ef93a3d4423efb575b356f781
Binary files /dev/null and b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..530724da52f94bc7a22c27db16279bf6edafd53d
Binary files /dev/null and b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9268a68948a2a6061f630ada314a916620bc8347
Binary files /dev/null and b/__pycache__/test_pipelines_summarization.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e34700bb4b18e71fb1b3c7407962c6c559d464b
Binary files /dev/null and b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6766b7b7762b3771ca611fbf9a99278d6db87f15
Binary files /dev/null and b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93822d50e1e62e25349912664dc18c9f0c5c4853
Binary files /dev/null and b/__pycache__/test_pipelines_table_question_answering.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f273f92fd2f0b22cb7f8c37dc6afc98bd15d7d5
Binary files /dev/null and b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0015cefae503d7baad998f01fb2486568b6af459
Binary files /dev/null and b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0015cefae503d7baad998f01fb2486568b6af459
Binary files /dev/null and b/__pycache__/test_pipelines_text2text_generation.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b89afcf74bda1f19282dc9258430e0632c22d9b6
Binary files /dev/null and b/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b89afcf74bda1f19282dc9258430e0632c22d9b6
Binary files /dev/null and b/__pycache__/test_pipelines_text_classification.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6973f5a917a779e0736daf6cbc2ba1de03087328
Binary files /dev/null and b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f1327f28823d27943cc788d7fd4f4cc31930a8e
Binary files /dev/null and b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f1327f28823d27943cc788d7fd4f4cc31930a8e
Binary files /dev/null and b/__pycache__/test_pipelines_text_generation.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a6184b7c3efd5d9c48a4e6fc2e6276505e1ebbb
Binary files /dev/null and b/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb44ad0fca0c610b71915da83ca45604ba54d686
Binary files /dev/null and b/__pycache__/test_pipelines_token_classification.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e488a5fa88336d57884325ef15e9cfdcec025837
Binary files /dev/null and b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e418bba57b0b916b5d0abd5a54c27656fbde60a2
Binary files /dev/null and b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d053de76374501d1c12e8f7da193067956e65aa2
Binary files /dev/null and b/__pycache__/test_pipelines_translation.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e32e1b055c0d11bd72e02c9fe0c78470c87a2cf
Binary files /dev/null and b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d61855fbe611a166ae839ced4d21a5d45a9d03d4
Binary files /dev/null and b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..255b7726b9f7dce9a727cdb1419e788c36d6b738
Binary files /dev/null and b/__pycache__/test_pipelines_zero_shot.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dae6dbf12a38c4c4184f9654780f5c6d7b057301
Binary files /dev/null and b/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c5465e7084c7437101caa23701f8038821f21fc
Binary files /dev/null and b/__pycache__/test_processor_speech_to_text.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2772920cd459e63268d7b9e5cd5d66f8eaedd8e0
Binary files /dev/null and b/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2772920cd459e63268d7b9e5cd5d66f8eaedd8e0
Binary files /dev/null and b/__pycache__/test_processor_wav2vec2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..156220707c834227a5915cb61d1aeb7dabda1680
Binary files /dev/null and b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a0fa8f488b38b333dd2822af625069718bf3a1d
Binary files /dev/null and b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3866a8a4dbe5df486259d5236590b15466dda5cf
Binary files /dev/null and b/__pycache__/test_retrieval_rag.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2373512bb0137aff12e01863771604b1ca5a13f2
Binary files /dev/null and b/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2373512bb0137aff12e01863771604b1ca5a13f2
Binary files /dev/null and b/__pycache__/test_sequence_feature_extraction_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a5e0248128b9378418dafbfb245282389f141fe
Binary files /dev/null and b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f569cc78a6a79b8d2446a4a05ec71594b765740
Binary files /dev/null and b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e83bcb2b0d470074be1cb746151a523972388c8
Binary files /dev/null and b/__pycache__/test_skip_decorators.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9c9686be33692115ea624423d89bb1e4fb3cf21
Binary files /dev/null and b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bbb41a427fad88b01552590ee730c406ab7de8e
Binary files /dev/null and b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be3238a798f92a43474ec5afed9a37c2afe1abf5
Binary files /dev/null and b/__pycache__/test_tokenization_albert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_albert.cpython-38.pyc b/__pycache__/test_tokenization_albert.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b103599b18c5219d32b3a83860575174a25bc2c
Binary files /dev/null and b/__pycache__/test_tokenization_albert.cpython-38.pyc differ
diff --git a/__pycache__/test_tokenization_albert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_albert.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98b1d85342aa91b3656a0e21e341694b0842d1ed
Binary files /dev/null and b/__pycache__/test_tokenization_albert.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d71b33656367ba5846103c0d175e4366803f7e1
Binary files /dev/null and b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5280dbdc755d347d14e003f727f98cb62e68bab6
Binary files /dev/null and b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e33628920f4e169d277cf2d31c3747eb4202e826
Binary files /dev/null and b/__pycache__/test_tokenization_auto.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_auto.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_auto.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5eca865a4dd2ee1d6ef9d4174c33b1161cad00a5
Binary files /dev/null and b/__pycache__/test_tokenization_auto.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf93e59b84e92f1a832c7b6a97d86330335fdb0d
Binary files /dev/null and b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac9d2a1998594fde7396539ed7a0a0ae30292e36
Binary files /dev/null and b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79badee81eb2235836cfa813fb8c034b05881588
Binary files /dev/null and b/__pycache__/test_tokenization_bart.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bart.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bart.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b22601272ac62c79173ee1fbf9efb8f7b0dd9e33
Binary files /dev/null and b/__pycache__/test_tokenization_bart.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16aaa7cf05a2ee8fe974415a24e0498ae4a338a0
Binary files /dev/null and b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd538cb3d251e363837afc2515f9230ef312a311
Binary files /dev/null and b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd538cb3d251e363837afc2515f9230ef312a311
Binary files /dev/null and b/__pycache__/test_tokenization_barthez.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_barthez.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_barthez.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5258fe33a41ab3b26a4e437a051f39a88dec5e90
Binary files /dev/null and b/__pycache__/test_tokenization_barthez.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1176f22d104b38c122351e5ffe555ba2352c2b77
Binary files /dev/null and b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21135dcfbb904d0e2447965e45af0397485108fe
Binary files /dev/null and b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..297695b5f9d8ec5eb73cdfa114e4f484d9fc769a
Binary files /dev/null and b/__pycache__/test_tokenization_bert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcfda8bc92502cf43dbad3d13d4fac13b80b7b42
Binary files /dev/null and b/__pycache__/test_tokenization_bert.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1aea36da1fd801216affdbe4f4ca294612c068bd
Binary files /dev/null and b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67442dabd1e7f3b9097e9c7cc69fb89e1fba7f79
Binary files /dev/null and b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67442dabd1e7f3b9097e9c7cc69fb89e1fba7f79
Binary files /dev/null and b/__pycache__/test_tokenization_bert_generation.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert_generation.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert_generation.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbf63ef2ee595e9368e2edb937afb4b2048e27bb
Binary files /dev/null and b/__pycache__/test_tokenization_bert_generation.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a09d663695e0beb959a9755eb4809db1a9e539
Binary files /dev/null and b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92c98e82497dfaee7b6db59f71f22b25c82495de
Binary files /dev/null and b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92c98e82497dfaee7b6db59f71f22b25c82495de
Binary files /dev/null and b/__pycache__/test_tokenization_bert_japanese.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bert_japanese.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bert_japanese.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efa165beccd7ee6469ddbf077b6c9db6acdba72f
Binary files /dev/null and b/__pycache__/test_tokenization_bert_japanese.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..373209b39f60642ad386318287252b3be2d8c0a6
Binary files /dev/null and b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18017204f64c8560ddcee4ac658c792f9e7784cd
Binary files /dev/null and b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..200a0ad787f673bab3164b9de863eca4ad28abc0
Binary files /dev/null and b/__pycache__/test_tokenization_bertweet.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_bertweet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_bertweet.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0bc1d77e418f99da6e4bde5d5022ffbb245630c0
Binary files /dev/null and b/__pycache__/test_tokenization_bertweet.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7a226c5cb28382d2e105930c4cf17ecccc9e851
Binary files /dev/null and b/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..241f5f2f0cbde88e1456aba1874a7686f69e7051
Binary files /dev/null and b/__pycache__/test_tokenization_big_bird.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_big_bird.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_big_bird.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9609b047465a88470c13e9238c31eef70e00904
Binary files /dev/null and b/__pycache__/test_tokenization_big_bird.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2725e9508f11a27180b9d285b75b47b504a6024d
Binary files /dev/null and b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f00a8e4e9920fc76752d4a5383fe9d2c101163ba
Binary files /dev/null and b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f00a8e4e9920fc76752d4a5383fe9d2c101163ba
Binary files /dev/null and b/__pycache__/test_tokenization_blenderbot.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_blenderbot.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_blenderbot.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3358e8bfc64c96f3026d0cea96bccabfe03c2cab
Binary files /dev/null and b/__pycache__/test_tokenization_blenderbot.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9aa534a2f8c1ebe6604aafd82f44432378ec7e91
Binary files /dev/null and b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9dc8fe0b43e6d0b9309f42e235b9d6c4afa4440
Binary files /dev/null and b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9dc8fe0b43e6d0b9309f42e235b9d6c4afa4440
Binary files /dev/null and b/__pycache__/test_tokenization_camembert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_camembert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_camembert.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd015efb910ea25dac07cc22cbe7287af03db2eb
Binary files /dev/null and b/__pycache__/test_tokenization_camembert.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_clip.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_clip.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3408d509640e9f66c1e5e1246d9241ae2ab64f7
Binary files /dev/null and b/__pycache__/test_tokenization_clip.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b236db166a705fe413d3d7eef69ffd437eceafa7
Binary files /dev/null and b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..834c3e5bf362a2ed399853db7b4846fd05646546
Binary files /dev/null and b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d6d4d3fd9b59c7643ffe6e832460534dcb46de0
Binary files /dev/null and b/__pycache__/test_tokenization_common.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_common.cpython-38.pyc b/__pycache__/test_tokenization_common.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baba4353a3f8057b80259ae0ae921ec1b4cdcaec
Binary files /dev/null and b/__pycache__/test_tokenization_common.cpython-38.pyc differ
diff --git a/__pycache__/test_tokenization_common.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_common.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..433a70b7880b6a97a0727c0b41168c939981f071
Binary files /dev/null and b/__pycache__/test_tokenization_common.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0100ad048e97559e96aa5101b72470b27819dde
Binary files /dev/null and b/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab6daf1cbae69c0b513fa20e4c6beaf6f03d6269
Binary files /dev/null and b/__pycache__/test_tokenization_cpm.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_cpm.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_cpm.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..804fd7db67313f22a3a1f92f2d81d03e58e6b02b
Binary files /dev/null and b/__pycache__/test_tokenization_cpm.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d08f7c5c5daac2f7618c3b9f6c6757258803172
Binary files /dev/null and b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3759c9b2fd3c25b4735a58f9d0bf53887b5b13c3
Binary files /dev/null and b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1538b1ef7c556852a07e2bbfaf63bc947555a14a
Binary files /dev/null and b/__pycache__/test_tokenization_ctrl.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_ctrl.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_ctrl.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9300571b8faa60bf2cc0db5c67cd425de73f7725
Binary files /dev/null and b/__pycache__/test_tokenization_ctrl.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee14bb8de1dc2286ab040c747ae4772988f2f83
Binary files /dev/null and b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0be260cc49d9b29e4876a0aca5d8ef3bf968b099
Binary files /dev/null and b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fca23200ae3c0448bf16a3a34c134e1d36277ee1
Binary files /dev/null and b/__pycache__/test_tokenization_deberta.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_deberta.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_deberta.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fb67d5c5c785f0ba59735c371a116c184d3dd01
Binary files /dev/null and b/__pycache__/test_tokenization_deberta.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be1d2597b1b2b080883518d839db97cb1f2738c7
Binary files /dev/null and b/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be1d2597b1b2b080883518d839db97cb1f2738c7
Binary files /dev/null and b/__pycache__/test_tokenization_deberta_v2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_deberta_v2.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_deberta_v2.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79a5f82035af66de547a427b26aaa59e37ed0211
Binary files /dev/null and b/__pycache__/test_tokenization_deberta_v2.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0eac01a5b485883ebf85751e3bca7e7bd5935f
Binary files /dev/null and b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfe2931dd2ac4d0010c578a9e397a195d5384095
Binary files /dev/null and b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfe2931dd2ac4d0010c578a9e397a195d5384095
Binary files /dev/null and b/__pycache__/test_tokenization_distilbert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_distilbert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_distilbert.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee035399f62dc5557b3daee12775ea05e2619d78
Binary files /dev/null and b/__pycache__/test_tokenization_distilbert.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ca60e81b777119e131b670f53f7cd3ed9352a0e
Binary files /dev/null and b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e82ed95e660ed006011d61b0fbc5e23d441cc111
Binary files /dev/null and b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e82ed95e660ed006011d61b0fbc5e23d441cc111
Binary files /dev/null and b/__pycache__/test_tokenization_dpr.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_dpr.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_dpr.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48cbcabc8f8ca0da6f9a04dfd3631b3283127747
Binary files /dev/null and b/__pycache__/test_tokenization_dpr.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b478e6e6b19da5f70fb9610b888782777f7e6ee2
Binary files /dev/null and b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02f9ab7b4e6c47f7391f5657ed93322a8f88a716
Binary files /dev/null and b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..016e4757130a4ad352c1ca0adb6050f7b33ada7d
Binary files /dev/null and b/__pycache__/test_tokenization_fsmt.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_fsmt.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_fsmt.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..658b2ebc3a4e4dbcdbc2d565d9aad30b972ae4aa
Binary files /dev/null and b/__pycache__/test_tokenization_fsmt.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c6e8bac94953cd39a83d41b4881ab0494d84fd8
Binary files /dev/null and b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49b6374b57b9358bb4aebd34a107fe9ac86ad059
Binary files /dev/null and b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49b6374b57b9358bb4aebd34a107fe9ac86ad059
Binary files /dev/null and b/__pycache__/test_tokenization_funnel.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_funnel.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_funnel.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ea3854e38d22685b2ae03c0c9635d03334709d9
Binary files /dev/null and b/__pycache__/test_tokenization_funnel.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85304fca23533daaa0b09bf24aaa3da80a9c6e2b
Binary files /dev/null and b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..275e5a7a43a96051234ad67a275bdeaefb1db704
Binary files /dev/null and b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b898a00fab04212a108496f8e32d61044a82bfa4
Binary files /dev/null and b/__pycache__/test_tokenization_gpt2.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_gpt2.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_gpt2.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7abff544ef494187ea03c29e85399c258308192a
Binary files /dev/null and b/__pycache__/test_tokenization_gpt2.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ccb798d6b069888cc7ab07324c77faadfb5804d7
Binary files /dev/null and b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08f0135e6eeea0d0923df34ee268a218621d8ab0
Binary files /dev/null and b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08f0135e6eeea0d0923df34ee268a218621d8ab0
Binary files /dev/null and b/__pycache__/test_tokenization_herbert.cpython-38-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_herbert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_herbert.cpython-39-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7d969dcdbf1fec59e687354cccd8d9cf48e1c56
Binary files /dev/null and b/__pycache__/test_tokenization_herbert.cpython-39-pytest-6.2.4.pyc differ
diff --git a/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.0.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..693531905237b1d20ff52523f66b155fca16790d
Binary files /dev/null and b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.0.pyc differ
diff --git a/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3bbe54c41bfe765cb054b26b6bd731df83f86b8
Binary files /dev/null and b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.2.pyc differ
diff --git a/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1308e8e890fb5ddc638a1eedf830acf6708ccd18
Binary files /dev/null and b/__pycache__/test_tokenization_layoutlm.cpython-38-pytest-6.2.4.pyc differ
diff --git
a/__pycache__/test_tokenization_layoutlm.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_layoutlm.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20db85540de3a6895fa4e901d05fca50d8f7b530 Binary files /dev/null and b/__pycache__/test_tokenization_layoutlm.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_luke.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_luke.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7415389546fe83fd69ebd679024e43f75f5e641a Binary files /dev/null and b/__pycache__/test_tokenization_luke.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_luke.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_luke.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2da610c0218985714bc9cf82e0bd23aab1ca6edd Binary files /dev/null and b/__pycache__/test_tokenization_luke.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c35003c0f619c1074f8b134cf1bb2e080f9c9ffa Binary files /dev/null and b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61d5f1fb25d21ce7050505e7252559f4197a5f6c Binary files /dev/null and b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61d5f1fb25d21ce7050505e7252559f4197a5f6c Binary files /dev/null and b/__pycache__/test_tokenization_lxmert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_lxmert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_lxmert.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39b1ba7c5c51d874efbaf78c9ea4174c898caa22 Binary files /dev/null and b/__pycache__/test_tokenization_lxmert.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a36c6445a9c9cf1f0286ab546bc43c6a8471f963 Binary files /dev/null and b/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62626cdccb905485a168f23649ced164f0082a3d Binary files /dev/null and b/__pycache__/test_tokenization_m2m_100.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_m2m_100.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_m2m_100.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8609d996d60caa75564bae50f1afc1ca88d69ee9 Binary files /dev/null and b/__pycache__/test_tokenization_m2m_100.cpython-39-pytest-6.2.4.pyc differ 
diff --git a/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8ee447ef28d2a7fcf6a1952a49212c5397656db Binary files /dev/null and b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b017c9037e021d06ff988fddcd953056a0442e76 Binary files /dev/null and b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b017c9037e021d06ff988fddcd953056a0442e76 Binary files /dev/null and b/__pycache__/test_tokenization_marian.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_marian.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_marian.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90c2ebda6696c677cc966135969e111b4c69ccca Binary files /dev/null and b/__pycache__/test_tokenization_marian.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55c7064e50351d9c4046bcd02a0676817cea9e96 Binary files /dev/null and b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c34bec9327a5f498e134f5674ea72d34a817c07b Binary files /dev/null and b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77ff7448e8c7dbc9df6721db6eb00945d51e6417 Binary files /dev/null and b/__pycache__/test_tokenization_mbart.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_mbart.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mbart.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c39f55031376d744585adf81689b1289195f2f42 Binary files /dev/null and b/__pycache__/test_tokenization_mbart.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c04c669f71a7696a7bfb211969418c022f729ea Binary files /dev/null and b/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3904f364fc9eee8abf81f5efa7c1e02ac936f46d Binary files /dev/null and b/__pycache__/test_tokenization_mbart50.cpython-38-pytest-6.2.4.pyc differ 
diff --git a/__pycache__/test_tokenization_mbart50.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mbart50.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0565c05f3afa39fabd374f23a8312953dd3c8409 Binary files /dev/null and b/__pycache__/test_tokenization_mbart50.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cfc4ca15664106aac53b47fbb0520711c18521c Binary files /dev/null and b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49dc77a80666eb19c28ecdc0b513b6b7f8bc6e85 Binary files /dev/null and b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49dc77a80666eb19c28ecdc0b513b6b7f8bc6e85 Binary files /dev/null and b/__pycache__/test_tokenization_mpnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_mpnet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_mpnet.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77f2600a4349a978c92a4670f84a1f60fdbe5f46 Binary files /dev/null and b/__pycache__/test_tokenization_mpnet.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4485b3e73702312ab6c3a91c232c989b0bc75b44 Binary files /dev/null and b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee8eab0aa3d64aef6bfa40add568fd57556812b8 Binary files /dev/null and b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60ae866ed0ee02cf668fe916c9f3e3ca6161daa9 Binary files /dev/null and b/__pycache__/test_tokenization_openai.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_openai.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_openai.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9200053ce312e8e7a11b2c3b001645a86bcb6a01 Binary files /dev/null and b/__pycache__/test_tokenization_openai.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..510188a3ec3804101d5dbf0db1d0afa1f419584a Binary files /dev/null and b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.0.pyc differ 
diff --git a/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94719be281ab8b239eb2b8fdae3976600ea8810f Binary files /dev/null and b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94719be281ab8b239eb2b8fdae3976600ea8810f Binary files /dev/null and b/__pycache__/test_tokenization_pegasus.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_pegasus.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_pegasus.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4890deb43ae6e0c2b98325aa52591253af06112a Binary files /dev/null and b/__pycache__/test_tokenization_pegasus.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a417060a09d74653dff3c3d13b7dc4a2bdcd5c1 Binary files /dev/null and b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c20cb757be05e5412308533e5f59fda0552f08c Binary files /dev/null and b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2b7275a2eca2567eea0a2e35fe64e6a765e0db4 Binary files /dev/null and b/__pycache__/test_tokenization_phobert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_phobert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_phobert.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9815fda2da3cd85695b9374c2265b974d0d370e Binary files /dev/null and b/__pycache__/test_tokenization_phobert.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4365241db14fe55ae30600be05008624332df293 Binary files /dev/null and b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45b1557bb9ff40ce8a1c46d421127b656995dcd2 Binary files /dev/null and b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45b1557bb9ff40ce8a1c46d421127b656995dcd2 Binary files /dev/null and 
b/__pycache__/test_tokenization_prophetnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_prophetnet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_prophetnet.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd6de7dcbb679e9509d442829ba49acd2efe9602 Binary files /dev/null and b/__pycache__/test_tokenization_prophetnet.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09516b0cea950e61445722977aebee5cebac971c Binary files /dev/null and b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ea38700a28f6cb33711bd093dd0d151a14af670 Binary files /dev/null and b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ea38700a28f6cb33711bd093dd0d151a14af670 Binary files /dev/null and b/__pycache__/test_tokenization_rag.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_rag.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_rag.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce0bc957bb42592220410d4d04c5bd2024fafb80 Binary files /dev/null and b/__pycache__/test_tokenization_rag.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ee6c93746c9c624341e21c531f8e91644bb864b Binary files /dev/null and b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dee46a349265fba45036dd7ebc118de6ddd5e0eb Binary files /dev/null and b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76338e5d589f746f899cdbbd4b1b3ccf4c6fa208 Binary files /dev/null and b/__pycache__/test_tokenization_reformer.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_reformer.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_reformer.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7142a78f5d45e155b2e3acc824fed2e67b66a7a4 Binary files /dev/null and b/__pycache__/test_tokenization_reformer.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da61df283c82970d95eb0afc2030221c0236ef5d Binary files 
/dev/null and b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f98a80942d8606bd3b3c63ee18c2328286e7f64 Binary files /dev/null and b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53effadf6ceb4f1573f0d482a53a9d537122432e Binary files /dev/null and b/__pycache__/test_tokenization_roberta.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_roberta.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_roberta.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ea37b141c938b45a43a3d00be1bfa7ceed91930 Binary files /dev/null and b/__pycache__/test_tokenization_roberta.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_roformer.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_roformer.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c482334986c6eb75a7f2841e3e756a279499d4f9 Binary files /dev/null and b/__pycache__/test_tokenization_roformer.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64b08a42eaa8a97c0b21f6b9333f34e2b428d548 Binary files /dev/null and b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6295476c839d34a0b209aec55b52988635b46475 Binary files /dev/null and b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6295476c839d34a0b209aec55b52988635b46475 Binary files /dev/null and b/__pycache__/test_tokenization_small_blenderbot.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_small_blenderbot.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_small_blenderbot.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe46436421f66556ec5bb7f8a74cbeb72094514e Binary files /dev/null and b/__pycache__/test_tokenization_small_blenderbot.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0119a0cc7722af6c8b658c8f269d0da982839751 Binary files /dev/null and b/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.4.pyc 
b/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c89073820def64b283e70a9dbe23c7989be8729 Binary files /dev/null and b/__pycache__/test_tokenization_speech_to_text.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_speech_to_text.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_speech_to_text.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e427af32e30f04a64108e8dd28bf06dc605c3289 Binary files /dev/null and b/__pycache__/test_tokenization_speech_to_text.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fec4c98f1671f904a266b7c3cce98933a82c0a3 Binary files /dev/null and b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..887e12b5a009355bb45342cae4e5d03a8a9c088c Binary files /dev/null and b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..887e12b5a009355bb45342cae4e5d03a8a9c088c Binary files /dev/null and b/__pycache__/test_tokenization_squeezebert.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_squeezebert.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_squeezebert.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6273f19022a714889020ec38e2631adce6cb1d0a Binary files /dev/null and b/__pycache__/test_tokenization_squeezebert.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dc4441e853ad70e10682b18e43934f704dbf3ab Binary files /dev/null and b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5e941953643d804c6c4d08f1371fc374f6e37b2 Binary files /dev/null and b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ec383f81e83497f5931f5359deb4039a9c185fd Binary files /dev/null and b/__pycache__/test_tokenization_t5.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_t5.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_t5.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f4f73f517d2cf543378a9b70c6a5658b0330ad3 Binary files /dev/null and b/__pycache__/test_tokenization_t5.cpython-39-pytest-6.2.4.pyc differ diff --git 
a/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4b524fc8625e7e8ea03d0a10e94b06ef7b77ec9 Binary files /dev/null and b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8afda071c4091b01bb8e7388c65cb787e9565d70 Binary files /dev/null and b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d921b9565e86d3d1dc11dad5cac2376d1afc0131 Binary files /dev/null and b/__pycache__/test_tokenization_tapas.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_tapas.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_tapas.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8eeba22f5e6ee9a876df5c4a5f01eb622500131 Binary files /dev/null and b/__pycache__/test_tokenization_tapas.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc4525a28817719e3a1d235401e8a22eb753e96c Binary files /dev/null and b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b758c6fe7b084a8319cf378a9c82e059d566f56 Binary files /dev/null and b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7c601e39416aacf3e4d0979a298ba77d2b2535f Binary files /dev/null and b/__pycache__/test_tokenization_transfo_xl.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_transfo_xl.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_transfo_xl.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c22c2ae7f1a39c0db028f5adeefc6f25b9e08753 Binary files /dev/null and b/__pycache__/test_tokenization_transfo_xl.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46111ea810ef3ad38cdf2be494b97c5e36c90fdf Binary files /dev/null and b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..936fbc65addde30b03581d9820cd4ac7e55df4ac Binary files /dev/null and 
b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3db975551c238018b4fb509bf36c3b4a53bee335 Binary files /dev/null and b/__pycache__/test_tokenization_utils.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_utils.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_utils.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fc4c5c8f88f654cef73b0332bb7f2b19ddf4ce2 Binary files /dev/null and b/__pycache__/test_tokenization_utils.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6080085417f18787ce9556c6caab0946a31cc142 Binary files /dev/null and b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..626361719903def20fd59dc2732a3750f4778574 Binary files /dev/null and b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..572c5e7941297cb38232e72df640de9bcdc736d6 Binary files /dev/null and b/__pycache__/test_tokenization_wav2vec2.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_wav2vec2.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_wav2vec2.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbfce155d422aa7c421db6cf193d319255e7845e Binary files /dev/null and b/__pycache__/test_tokenization_wav2vec2.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d45c88e7360485b3a78d743b5648eb28e76c2220 Binary files /dev/null and b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8e6803eb78d70afda3d2c66f54ba0d933fba55e Binary files /dev/null and b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7a50db7a4ec94dcef9b874e8b33838ed87e1699 Binary files /dev/null and b/__pycache__/test_tokenization_xlm.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..890e6073649f51d7d55502335849ae7e00c2f4db Binary files /dev/null and 
b/__pycache__/test_tokenization_xlm.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef7cb61fc502448097001d52b1273b50881289c6 Binary files /dev/null and b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b01a4138d0fbcc54c5da3b066625097b52d066bf Binary files /dev/null and b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b01a4138d0fbcc54c5da3b066625097b52d066bf Binary files /dev/null and b/__pycache__/test_tokenization_xlm_prophetnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm_prophetnet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm_prophetnet.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dd5f277ad2bb859e314bd1317efac648d3c7771 Binary files /dev/null and b/__pycache__/test_tokenization_xlm_prophetnet.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a9033102312007d77ed97811dea767d329f5a1d Binary files /dev/null and b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de5e5ba8d04a3326e70dbc284e93acb8a5d1b1a7 Binary files /dev/null and b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70c71d3ca080a5f34961c6161a4c0e9a717c3cad Binary files /dev/null and b/__pycache__/test_tokenization_xlm_roberta.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlm_roberta.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlm_roberta.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39674f83a4233995f6483d2891c6818853328447 Binary files /dev/null and b/__pycache__/test_tokenization_xlm_roberta.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f485af7ef1a7da164a51c80894e91c5bfda6e294 Binary files /dev/null and b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.2.pyc 
b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..516a57c2cd8d782c91e1c2554699a63a453d560f Binary files /dev/null and b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..516a57c2cd8d782c91e1c2554699a63a453d560f Binary files /dev/null and b/__pycache__/test_tokenization_xlnet.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_tokenization_xlnet.cpython-39-pytest-6.2.4.pyc b/__pycache__/test_tokenization_xlnet.cpython-39-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a116c86a13ff0cd959784f3f31dc2f389076515 Binary files /dev/null and b/__pycache__/test_tokenization_xlnet.cpython-39-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cad417755984243282ad5ff9ab2081a1c95ed0a8 Binary files /dev/null and b/__pycache__/test_trainer.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fab76a49c053ec015cbe548d49d29f30e37afe8 Binary files /dev/null and b/__pycache__/test_trainer.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d83082cd6c2a89f54273b5ac649981159a095b6 Binary files /dev/null and b/__pycache__/test_trainer.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efe876699b2821f3b37aa37bf736337d5636e3cd Binary files /dev/null and b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ae0dd393c70024e4553b3628d417cec623abf6b Binary files /dev/null and b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88f0d8391022e1223c99fde8843d65ae90cf6759 Binary files /dev/null and b/__pycache__/test_trainer_callback.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1e2b19570e4877d88b65c651d4bd8c77ea230e6 Binary files /dev/null and b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fc8b00cf27df78bf5b9dc8d0f20765e387a13924 Binary files /dev/null and b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f428407add9faae0d29273af844fd432d7276760 Binary files /dev/null and b/__pycache__/test_trainer_distributed.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb929ba5398ef9d0dbca0cbdd005db2e85120559 Binary files /dev/null and b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79464172d675ed8051bb31e3897f74521d0905ab Binary files /dev/null and b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00afdc34fd88b99d1a5bb3aa0810f73fb1fe1df8 Binary files /dev/null and b/__pycache__/test_trainer_seq2seq.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..254eda6840626641db85edccfd0350aa700943a0 Binary files /dev/null and b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b0939a1fd7c52f6cf6f7947025f49c305cd8b14 Binary files /dev/null and b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..569dae17b72c53fe4c2061a41cba945919e7b95b Binary files /dev/null and b/__pycache__/test_trainer_tpu.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c753f67cb0c4910c5b9e0e343752d341619c387 Binary files /dev/null and b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaab01aae086f003c9351ae74001fc7b69a15ab6 Binary files /dev/null and b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2ba3546807615991261cc3809e3a8a0923b1cfb Binary files /dev/null and 
b/__pycache__/test_trainer_utils.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9927cbca5c01a810fab69107f62e5f0490c850ea Binary files /dev/null and b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbb1771d25041bf3b325d9ce79a5de41a768b1d0 Binary files /dev/null and b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcce97240096df92aedf880cc090f930899d9026 Binary files /dev/null and b/__pycache__/test_utils_check_copies.cpython-38-pytest-6.2.4.pyc differ diff --git a/__pycache__/test_versions_utils.cpython-38-pytest-6.2.0.pyc b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43268099c8592bcbe5d515d4bd43e7ff46e73a90 Binary files /dev/null and b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.0.pyc differ diff --git a/__pycache__/test_versions_utils.cpython-38-pytest-6.2.2.pyc b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e722e91b3dd99d06de1b4f759eef72a3032b9c3e Binary files /dev/null and b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.2.pyc differ diff --git a/__pycache__/test_versions_utils.cpython-38-pytest-6.2.4.pyc b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e4e869e4a52824e49df4032ed1367d2b9162125 Binary files /dev/null and b/__pycache__/test_versions_utils.cpython-38-pytest-6.2.4.pyc differ diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2e0bb6172db9942940414985236269a5eb41552c --- /dev/null +++ b/config.json @@ -0,0 +1,21 @@ +{ + "activation": "gelu", + "architectures": [ + "DistilBertModel" + ], + "attention_dropout": 0.1, + "dim": 2, + "dropout": 0.1, + "hidden_dim": 8, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 2, + "n_layers": 2, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.7.0.dev0", + "vocab_size": 4 +} diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5f161436dceaaf0524840497267666f0cd9aea --- /dev/null +++ b/conftest.py @@ -0,0 +1,55 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
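The config.json added above defines a deliberately tiny DistilBERT (hidden size 2, a 4-token vocabulary, 2 layers) meant for fast tests rather than real inference. As a minimal sketch, assuming a local checkout where this config.json sits in the working directory, a model can be instantiated from it like so (this snippet is illustrative and not part of the diff):

from transformers import DistilBertConfig, DistilBertModel

# Read the tiny test config from the directory holding config.json;
# fields absent from the file fall back to DistilBertConfig defaults.
config = DistilBertConfig.from_pretrained(".")
model = DistilBertModel(config)  # randomly initialized; no weights are in the diff
print(model.num_parameters())    # only a few thousand parameters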
+# See the License for the specific language governing permissions and +# limitations under the License. + +# tests directory-specific settings - this file is run automatically +# by pytest before any tests are run + +import sys +import warnings +from os.path import abspath, dirname, join + + +# allow having multiple repository checkouts and not needing to remember to rerun +# 'pip install -e .[dev]' when switching between checkouts and running tests. +git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) +sys.path.insert(1, git_repo_path) + +# silence FutureWarning warnings in tests since often we can't act on them until +# they become normal warnings - i.e. the tests still need to test the current functionality +warnings.simplefilter(action="ignore", category=FutureWarning) + + +def pytest_configure(config): + config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipeline are tested") + config.addinivalue_line( + "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested" + ) + config.addinivalue_line( + "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" + ) + config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") + + +def pytest_addoption(parser): + from transformers.testing_utils import pytest_addoption_shared + + pytest_addoption_shared(parser) + + +def pytest_terminal_summary(terminalreporter): + from transformers.testing_utils import pytest_terminal_summary_main + + make_reports = terminalreporter.config.getoption("--make-reports") + if make_reports: + pytest_terminal_summary_main(terminalreporter, id=make_reports) diff --git a/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.2.pyc b/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..191396ef3ed9910971d5e82bb8710e8882879d5d Binary files /dev/null and b/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.2.pyc differ diff --git a/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.4.pyc b/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..788887af17321ead309d331d08b58f48a9f1dfa4 Binary files /dev/null and b/deepspeed/__pycache__/test_deepspeed.cpython-38-pytest-6.2.4.pyc differ diff --git a/deepspeed/ds_config_zero2.json b/deepspeed/ds_config_zero2.json new file mode 100644 index 0000000000000000000000000000000000000000..dec097dd19887fe344411ae4185081254d9f460f --- /dev/null +++ b/deepspeed/ds_config_zero2.json @@ -0,0 +1,50 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + 
"train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/deepspeed/ds_config_zero3.json b/deepspeed/ds_config_zero3.json new file mode 100644 index 0000000000000000000000000000000000000000..a80a173b7a9704b8e270956564c787f4fd44ec8e --- /dev/null +++ b/deepspeed/ds_config_zero3.json @@ -0,0 +1,57 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/deepspeed/test_deepspeed.py b/deepspeed/test_deepspeed.py new file mode 100644 index 0000000000000000000000000000000000000000..74a2928c3ecce180406cb85c3ecd8dd6f684b14d --- /dev/null +++ b/deepspeed/test_deepspeed.py @@ -0,0 +1,896 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses +import io +import json +import os +import unittest +from copy import deepcopy + +from parameterized import parameterized +from transformers import AutoModel, TrainingArguments, is_torch_available, logging +from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available +from transformers.file_utils import WEIGHTS_NAME +from transformers.testing_utils import ( + CaptureLogger, + CaptureStderr, + ExtendSysPath, + LoggingLevel, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + mockenv_context, + require_deepspeed, + require_torch_gpu, + require_torch_multi_gpu, + slow, +) +from transformers.trainer_utils import set_seed + + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa + + if is_torch_available(): + from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer # noqa + + +set_seed(42) +MBART_TINY = "sshleifer/tiny-mbart" +T5_SMALL = "t5-small" +T5_TINY = "patrickvonplaten/t5-tiny-random" + + +def load_json(path): + with open(path) as f: + return json.load(f) + + +def require_deepspeed_aio(test_case): + """ + Decorator marking a test that requires deepspeed aio (nvme) + """ + if not is_deepspeed_available(): + return unittest.skip("test requires deepspeed")(test_case) + + import deepspeed + from deepspeed.ops.aio import AsyncIOBuilder + + if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: + return unittest.skip("test requires deepspeed async-io")(test_case) + else: + return test_case + + +if is_deepspeed_available(): + from deepspeed.utils import logger as deepspeed_logger # noqa + from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled # noqa + +ZERO2 = "zero2" +ZERO3 = "zero3" +stages = [ZERO2, ZERO3] + + +@require_deepspeed +@require_torch_gpu +class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): + """ + Testing non-Trainer DeepSpeed integration + """ + + def setUp(self): + super().setUp() + + self.dist_env_1_gpu = dict( + MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" + ) + + def test_init_zero3(self): + # test that zero.Init() works correctly under zero3 + ds_config = { + "train_batch_size": 1, + "zero_optimization": { + "stage": 3, + }, + } + + dschf = HfDeepSpeedConfig(ds_config) + + self.assertTrue(dschf.is_zero3()) + self.assertTrue(is_deepspeed_zero3_enabled()) + + with LoggingLevel(logging.INFO): + with mockenv_context(**self.dist_env_1_gpu): + logger = logging.get_logger("transformers.modeling_utils") + with CaptureLogger(logger) as cl: + AutoModel.from_pretrained(T5_TINY) + self.assertIn("Detected DeepSpeed ZeRO-3", cl.out) + + # now remove zero optimization + del ds_config["zero_optimization"] + dschf = HfDeepSpeedConfig(ds_config) + + self.assertFalse(dschf.is_zero3()) + self.assertFalse(is_deepspeed_zero3_enabled()) + + with LoggingLevel(logging.INFO): + with mockenv_context(**self.dist_env_1_gpu): + logger = logging.get_logger("transformers.modeling_utils") + with CaptureLogger(logger) as cl: + AutoModel.from_pretrained(T5_TINY) + self.assertNotIn("Detected DeepSpeed ZeRO-3", cl.out) + + +@require_deepspeed +@require_torch_gpu +class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): + """ + + This class is for testing directly via get_regression_trainer + + It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods + which we can re-use here. 
+ + Important: this class' setup can only work with a single gpu because it runs within the current + pytest worker. For multi-gpu tests use TestDeepSpeedWithLauncher. + + Note: if any of the tests of this class get run there will be at least one gpu occupied by them + until this pytest worker exits. This is because the gpu memory allocated by the cuda-kernels + won't be released until this pytest worker exits. + + This may appear as some run-away tests if you watch `nvidia-smi` while other tests that fork new + processes are run. So there will be one or two "stale" processes reported in `nvidia-smi`. This + is not a bug. + """ + + def setUp(self): + super().setUp() + + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + + self.dist_env_1_gpu = dict( + MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" + ) + + self.ds_config_file = dict( + zero2=f"{self.test_file_dir_str}/ds_config_zero2.json", + zero3=f"{self.test_file_dir_str}/ds_config_zero3.json", + ) + + # use self.get_config_dict(stage) to use these to ensure the original is not modified + with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f: + config_zero2 = json.load(f) + # by default use fp16 + config_zero2["fp16"]["enabled"] = True + with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: + config_zero3 = json.load(f) + # by default use fp16 + config_zero3["fp16"]["enabled"] = True + # This setting slows things down, so don't enable it by default unless needed by a test. + # It's in the file as a demo for users since we want everything to work out of the box even if slower. + config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False + self.ds_config_dict = dict( + zero2=config_zero2, + zero3=config_zero3, + ) + + def get_config_dict(self, stage): + # As some tests modify the dict, always make a copy + return deepcopy(self.ds_config_dict[stage]) + + # --- These tests are enough to run on one of zero stages --- # + + def test_hf_ds_config_mismatch(self): + + ds_config = self.get_config_dict(ZERO2) + + # Purposefully configure these values to mismatch TrainingArguments values. + # This currently doesn't cover all keys (but it could) + per_device_train_batch_size = 2 + ds_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size + 2 + + ds_config["train_batch_size"] = 1000 + + gradient_accumulation_steps = 2 + ds_config["gradient_accumulation_steps"] = gradient_accumulation_steps + 2 + + max_grad_norm = 1.0 + ds_config["gradient_clipping"] = max_grad_norm + 0.1 + + adam_beta1, adam_beta2 = 0.9, 0.99 + ds_config["optimizer"]["params"]["betas"] = [adam_beta1 - 0.1, adam_beta2 - 0.1] + + fp16 = True + ds_config["fp16"]["enabled"] = not fp16 + + keys = [ + "per_device_train_batch_size", + "train_batch_size", + "gradient_accumulation_steps", + "max_grad_norm", + "betas", + "fp16", + ] + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer( + local_rank=0, + fp16=fp16, + deepspeed=ds_config, + per_device_train_batch_size=per_device_train_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + max_grad_norm=max_grad_norm, + adam_beta1=adam_beta1, + adam_beta2=adam_beta2, + ) + with self.assertRaises(Exception) as context: + trainer.train() + + for key in keys: + self.assertTrue( + key in str(context.exception), + f"{key} is not in the exception message:\n{context.exception}", + ) + + # Test various combos + # 1. 
DS scheduler + DS optimizer: this is already tested by most other tests + # 2. HF scheduler + HF optimizer: + # 3. DS scheduler + HF optimizer: + # 4. HF scheduler + DS optimizer: + + def test_hf_scheduler_hf_optimizer(self): + a = 0 + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + trainer.train() + new_a = trainer.model.a.item() + self.assertNotEqual(new_a, a) + + def test_ds_scheduler_hf_optimizer(self): + a = 0 + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + trainer.train() + new_a = trainer.model.a.item() + self.assertNotEqual(new_a, a) + + def test_hf_scheduler_ds_optimizer(self): + # this combo is not possible at the moment + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero2_dict = self.get_config_dict(ZERO2) + del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler + ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" + ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertTrue( + "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception), + f"got exception: {context.exception}", + ) + + @require_deepspeed_aio + def test_stage3_nvme_offload(self): + with mockenv_context(**self.dist_env_1_gpu): + # this actually doesn't have to be on NVMe, any storage will do since this test only + # runs a simple check that we can use some directory as if it were NVMe + nvme_path = self.get_auto_remove_tmp_dir() + nvme_config = dict(device="nvme", nvme_path=nvme_path) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict) + with CaptureLogger(deepspeed_logger) as cl: + trainer.train() + self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") + + # --- These tests need to run on both zero stages --- # + + @parameterized.expand(stages) + def test_hf_optimizer_with_offload(self, stage): + # must not allow non-DS optimizer when using ZERO-offload + ds_config_dict = self.get_config_dict(stage) + del ds_config_dict["optimizer"] # force default HF Trainer optimizer + # force cpu offload + ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, fp16=True, 
deepspeed=ds_config_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertIn( + "ZeRO Offload can only work with DeepSpeed optimizers", + str(context.exception), + f"got exception: {context.exception}", + ) + + @parameterized.expand(stages) + def test_fake_notebook_no_launcher(self, stage): + # this setup emulates a notebook where a launcher needs to be emulated by hand + + # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture + # DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if + # it's not run as the first test, as `sys.stdout` will no longer be the same. So we either have + # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or capture directly from the deepspeed_logger. + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage)) + with CaptureLogger(deepspeed_logger) as cl: + trainer.train() + self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") + + @parameterized.expand(stages) + def test_early_get_last_lr(self, stage): + # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may + # not run for the first few dozen steps while the loss scale is too large, and thus + # `get_last_lr` will fail if called during that warm-up stage, + # + # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls + # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step. + with mockenv_context(**self.dist_env_1_gpu): + a = b = 0.0 + trainer = get_regression_trainer( + a=a, + b=b, + local_rank=0, + train_len=8, + fp16=True, + deepspeed=self.get_config_dict(stage), + per_device_train_batch_size=8, + logging_steps=1, + ) + trainer.train() + post_train_a = trainer.model.a.item() + + # XXX: for some reason the following check fails with zero3 - not broken, but a + # different qualitative outcome - as if the optimizer did run + # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere + # print(trainer.model.a.item()) + # print(trainer.model.b.item()) + # need to investigate at some point + if stage == ZERO3: + return + + # it's enough that train didn't fail for this test, but we must check that + # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) + self.assertEqual(post_train_a, a) + + @parameterized.expand(stages) + def test_gradient_accumulation(self, stage): + # this test verifies that we get identical weights and similar loss with: + # 1. per_device_train_batch_size=16, gradient_accumulation_steps=1 + # 2. per_device_train_batch_size=4, gradient_accumulation_steps=4 + # since the 2nd should produce the same effective batch as the 1st, with the same results + # + # I can get an identical loss for a small train_len=32, with the initial + # dynamic loss scale value set to: + # "fp16.initial_scale_power": 1 + # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file + # but for some reason at train_len=64 the weights start to mismatch with this setup.
+ # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical + + train_len = 64 + a = b = 0.0 + + kwargs = dict( + a=a, + b=b, + local_rank=0, + train_len=train_len, + fp16=True, + deepspeed=self.get_config_dict(stage), + ) + + with mockenv_context(**self.dist_env_1_gpu): + no_grad_accum_trainer = get_regression_trainer( + **kwargs, + per_device_train_batch_size=16, + gradient_accumulation_steps=1, + ) + no_grad_accum_result = no_grad_accum_trainer.train() + no_grad_accum_loss = no_grad_accum_result.training_loss + no_grad_accum_a = no_grad_accum_trainer.model.a.item() + no_grad_accum_b = no_grad_accum_trainer.model.b.item() + # make sure the optimizer kicked in - if `a` hasn't changed from its original value then make train_len bigger + self.assertNotEqual(no_grad_accum_a, a) + + with mockenv_context(**self.dist_env_1_gpu): + yes_grad_accum_trainer = get_regression_trainer( + **kwargs, + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + ) + yes_grad_accum_result = yes_grad_accum_trainer.train() + yes_grad_accum_loss = yes_grad_accum_result.training_loss + yes_grad_accum_a = yes_grad_accum_trainer.model.a.item() + yes_grad_accum_b = yes_grad_accum_trainer.model.b.item() + self.assertNotEqual(yes_grad_accum_a, a) + + # training with a quarter of the batch size but 4x the accumulation steps should give the same + # weights, though sometimes there is still a slight difference of ~1e-6 + self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5) + self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5) + + # see the note above on how to get an identical loss on a small bs + self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2) + + def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage): + # adapted from TrainerIntegrationCommon.check_saved_checkpoints + + file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"] + + if stage == ZERO2: + ds_file_list = ["mp_rank_00_model_states.pt"] + elif stage == ZERO3: + ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"] + else: + raise ValueError(f"unknown stage {stage}") + + # XXX: this can be recoded and then removed once we require deepspeed>0.3.13 + from packaging import version + + import deepspeed + + if version.parse(deepspeed.__version__) > version.parse("0.3.13"): + ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt") + else: + ds_file_list.append("zero_pp_rank_0_mp_rank_00optim_states.pt") + + for step in range(freq, total, freq): + checkpoint = os.path.join(output_dir, f"checkpoint-{step}") + self.assertTrue(os.path.isdir(checkpoint), f"[{stage}] {checkpoint} dir is not found") + + # common files + for filename in file_list: + path = os.path.join(checkpoint, filename) + self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + + # ds files + ds_path = os.path.join(checkpoint, f"global_step{step}") + for filename in ds_file_list: + # filename = os.path.join(path, filename) + # print(filename) + path = os.path.join(ds_path, filename) + self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found") + + @parameterized.expand(stages) + def test_save_checkpoints(self, stage): + # adapted from TrainerIntegrationTest.test_save_checkpoints + + freq = 5 + output_dir = self.get_auto_remove_tmp_dir() + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage == ZERO3: +
ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + + # save checkpoints + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer( + output_dir=output_dir, + save_steps=freq, + fp16=True, + deepspeed=ds_config_dict, + ) + trainer.train() + + total = int(self.n_epochs * 64 / self.batch_size) + self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage) + + @parameterized.expand(stages) + def test_can_resume_training_errors(self, stage): + + with mockenv_context(**self.dist_env_1_gpu): + ds_config_dict = self.get_config_dict(stage) + output_dir = self.get_auto_remove_tmp_dir() + trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict) + + # 1. fail to find any checkpoint - due a fresh output_dir + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=True) + self.assertTrue( + "No valid checkpoint found in output directory" in str(context.exception), + f"got exception: {context.exception}", + ) + + # 2. fail to find a bogus checkpoint + with self.assertRaises(Exception) as context: + checkpoint = os.path.join(output_dir, "checkpoint-5") + trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") + self.assertTrue( + "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}" + ) + + @parameterized.expand(stages) + def test_can_resume_training_normal(self, stage): + # adapted from TrainerIntegrationTest.test_can_resume_training + # test normal resume for each stage separately, error-handling is tested in a different test + output_dir = self.get_auto_remove_tmp_dir() + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step + if stage == ZERO3: + ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True + + kwargs = dict( + output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict + ) + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(**kwargs) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(output_dir, "checkpoint-5") + + # Reinitialize trainer + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # Now check with a later checkpoint that it also works when we span over one epoch + checkpoint = os.path.join(output_dir, "checkpoint-15") + + # Reinitialize trainer and load model + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + def test_config_object(self): + # test that we can switch from zero2 to zero3 in the same process for example + # test is_zero, etc. 
+ output_dir = self.get_auto_remove_tmp_dir() + kwargs = dict(output_dir=output_dir, train_len=8, fp16=True) + + ds_config_zero3_dict = self.get_config_dict("zero3") + ds_config_zero2_dict = self.get_config_dict("zero2") + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test that we can repeat that, and with train this time + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + trainer.train() + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test zero3 is disabled + trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs) + self.assertFalse(is_deepspeed_zero3_enabled()) + + # check config obj + config = deepspeed_config() + self.assertTrue(bool(config), "Deepspeed config should be accessible") + + del trainer + # now weakref should gc the global and we shouldn't get anything here + config = deepspeed_config() + self.assertFalse(is_deepspeed_zero3_enabled()) + self.assertFalse(bool(config), "Deepspeed config should not be accessible") + + +@slow +@require_deepspeed +@require_torch_gpu +class TestDeepSpeedWithLauncher(TestCasePlus): + """This class is for testing via an external script - can do multiple gpus""" + + # Tests to devise # + # + # 1. predict_with_generate on multigpu - need to figure out how to give input sequences so that + # the 2 gpus will generate prediction sequences that aren't of the same length - this is because + # we had to code a special feature to sync the gpus when the predicted sequences aren't of the + # same length. In general this will be tested as a side-effect through a variety of other tests - + # it'll simply hang trying to synchronize with other gpus if this problem is encountered. So as + # long as we have a few full tests running on zero3 + predict_with_generate this should be + # mostly covered. + # + # but there are 5 variations on beam search in `generate` - with identical code branched with `if + # synced_gpus` + # + # 2.
most tests should probably be run on both: zero2 and zero3 configs + # + + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_basic_distributed(self, stage): + self.run_and_check(stage=stage, distributed=True) + + @parameterized.expand(stages) + def test_do_eval_no_train(self, stage): + # we should not fail if train is skipped + self.run_and_check( + stage=stage, + eval_steps=1, + distributed=False, + do_train=False, + do_eval=True, + ) + + @parameterized.expand(stages) + def test_fp32_non_distributed(self, stage): + # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - + # therefore no quality checks, just basic completion checks are done + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=False, + do_train=True, + do_eval=True, + quality_checks=False, + fp16=False, + ) + + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_fp32_distributed(self, stage): + # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - + # therefore no quality checks, just basic completion checks are done + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=True, + do_train=True, + do_eval=True, + quality_checks=False, + fp16=False, + ) + + @parameterized.expand(stages) + def test_resume_train_not_from_ds_checkpoint(self, stage): + # do normal training and then resume not from the deepspeed checkpoint but explicitly from + # the saved model dir + + do_train = True + do_eval = False + kwargs = dict(stage=stage, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval) + + # 1. normal training + output_dir = self.run_and_check(**kwargs) + + # 2. now resume explicitly from the saved weights, by passing --model_name_or_path output_dir + # - i.e. 
the same path the model was saved to in step 1 + output_dir = self.run_trainer(**kwargs, model_name=output_dir) + + self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) + + def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): + + if do_train: + train_metrics = load_json(os.path.join(output_dir, "train_results.json")) + self.assertIn("train_samples_per_second", train_metrics) + if quality_checks: + self.assertGreater(train_metrics["train_samples_per_second"], 0.5) + + if do_eval: + eval_metrics = load_json(os.path.join(output_dir, "eval_results.json")) + self.assertIn("eval_bleu", eval_metrics) + if quality_checks: + self.assertGreater(eval_metrics["eval_bleu"], 1) + + # XXX: need to do better validation beyond just that the run was successful + def run_and_check( + self, + stage, + model_name: str = T5_SMALL, + eval_steps: int = 10, + distributed: bool = True, + do_train: bool = True, + do_eval: bool = True, + quality_checks: bool = True, + fp16: bool = True, + extra_args_str: str = None, + remove_args_str: str = None, + ): + + # we are doing quality testing so using a small real model + output_dir = self.run_trainer( + stage=stage, + model_name=model_name, + eval_steps=eval_steps, + num_train_epochs=1, + do_train=do_train, + do_eval=do_eval, + distributed=distributed, + fp16=fp16, + extra_args_str=extra_args_str, + remove_args_str=remove_args_str, + ) + + self.do_checks(output_dir, do_train=do_train, do_eval=do_eval, quality_checks=quality_checks) + + return output_dir + + def run_trainer( + self, + stage: str, + model_name: str, + eval_steps: int = 10, + num_train_epochs: int = 1, + do_train: bool = False, + do_eval: bool = True, + distributed: bool = True, + fp16: bool = True, + extra_args_str: str = None, + remove_args_str: str = None, + ): + max_len = 32 + data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name_or_path {model_name} + --train_file {data_dir}/train.json + --validation_file {data_dir}/val.json + --output_dir {output_dir} + --overwrite_output_dir + --max_source_length {max_len} + --max_target_length {max_len} + --val_max_target_length {max_len} + --warmup_steps 8 + --predict_with_generate + --logging_steps 0 + --save_steps 0 + --eval_steps {eval_steps} + --group_by_length + --label_smoothing_factor 0.1 + --adafactor + --source_lang en + --target_lang ro + --report_to none + """.split() + args.extend(["--source_prefix", '"translate English to Romanian: "']) + + if fp16: + args.extend(["--fp16"]) + + actions = 0 + if do_train: + actions += 1 + args.extend( + f""" + --do_train + --num_train_epochs {str(num_train_epochs)} + --max_train_samples 16 + --per_device_train_batch_size 2 + --learning_rate 3e-3 + """.split() + ) + + if do_eval: + actions += 1 + args.extend( + """ + --do_eval + --max_eval_samples 16 + --per_device_eval_batch_size 2 + """.split() + ) + + assert actions > 0, "need at least do_train or do_eval for the test to run" + + if extra_args_str is not None: + args.extend(extra_args_str.split()) + + # currently only works for bool args + if remove_args_str is not None: + remove_args = remove_args_str.split() + args = [x for x in args if x not in remove_args] + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"] + launcher = self.get_launcher(distributed) + + cmd = launcher + script + args + ds_args + # keep for quick debug + #
print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + execute_subprocess_async(cmd, env=self.get_env()) + + return output_dir + + @parameterized.expand(stages) + def test_clm(self, stage): + # this test exercises model.resize_token_embeddings() which requires param gathering outside + # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` + + data_dir = self.tests_dir / "fixtures" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name_or_path sshleifer/tiny-gpt2 + --train_file {data_dir}/sample_text.txt + --validation_file {data_dir}/sample_text.txt + --output_dir {output_dir} + --overwrite_output_dir + --do_train + --do_eval + --max_train_samples 16 + --max_eval_samples 16 + --per_device_train_batch_size 2 + --per_device_eval_batch_size 2 + --num_train_epochs 1 + --warmup_steps 8 + --block_size 64 + --fp16 + --report_to none + """.split() + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() + script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] + launcher = self.get_launcher(distributed=True) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + execute_subprocess_async(cmd, env=self.get_env()) + + def test_clm_from_config_zero3(self): + # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called + + data_dir = self.tests_dir / "fixtures" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_type gpt2 + --tokenizer_name sshleifer/tiny-gpt2 + --train_file {data_dir}/sample_text.txt + --validation_file {data_dir}/sample_text.txt + --output_dir {output_dir} + --overwrite_output_dir + --do_train + --max_train_samples 4 + --per_device_train_batch_size 2 + --num_train_epochs 1 + --warmup_steps 8 + --block_size 8 + --fp16 + --report_to none + """.split() + + ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split() + script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"] + launcher = self.get_launcher(distributed=True) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + with CaptureStderr() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + assert "Detected DeepSpeed ZeRO-3" in cs.err + + def get_launcher(self, distributed=False): + # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup + # - it won't be able to handle that + # 2. 
for now testing with just 2 gpus max (since some quality tests may give different + # results with more gpus because we use very little data) + num_gpus = min(2, get_gpu_count()) if distributed else 1 + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() diff --git a/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.2.pyc b/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b9104d2e002098c4d88774e537967d4be25377c Binary files /dev/null and b/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.2.pyc differ diff --git a/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.4.pyc b/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.4.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eacb52d2603b94de51108980b25d449f7970dc59 Binary files /dev/null and b/extended/__pycache__/test_trainer_ext.cpython-38-pytest-6.2.4.pyc differ diff --git a/extended/runs/Apr13_11-47-31_Beaver/1618328854.4132347/events.out.tfevents.1618328854.Beaver.520043.1 b/extended/runs/Apr13_11-47-31_Beaver/1618328854.4132347/events.out.tfevents.1618328854.Beaver.520043.1 new file mode 100644 index 0000000000000000000000000000000000000000..39d7702a6d13c29596f7c7d94aaa9d934d274aa6 Binary files /dev/null and b/extended/runs/Apr13_11-47-31_Beaver/1618328854.4132347/events.out.tfevents.1618328854.Beaver.520043.1 differ diff --git a/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328854.Beaver.520043.0 b/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328854.Beaver.520043.0 new file mode 100644 index 0000000000000000000000000000000000000000..ea901051d0e585fe97a1cf024ee039e843d80c89 Binary files /dev/null and b/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328854.Beaver.520043.0 differ diff --git a/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328857.Beaver.520043.2 b/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328857.Beaver.520043.2 new file mode 100644 index 0000000000000000000000000000000000000000..38efba2edbae3d0610dab2d131619f0bbaab884a Binary files /dev/null and b/extended/runs/Apr13_11-47-31_Beaver/events.out.tfevents.1618328857.Beaver.520043.2 differ diff --git a/extended/test_trainer_ext.py b/extended/test_trainer_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..93ef0ddb555a280630a71f6700a6b9bee39e2ed6 --- /dev/null +++ b/extended/test_trainer_ext.py @@ -0,0 +1,241 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
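A brief orientation for the file that follows: it drives examples/pytorch/translation/run_translation.py two ways - distributed, by launching a subprocess, and non-distributed, by patching `sys.argv` and calling the script's `main()` in-process. A minimal sketch of the in-process pattern, with an illustrative argument subset (the real tests pass a much longer argument list):

    import sys
    from unittest.mock import patch

    # run_translation's main() reads its options from sys.argv like any CLI script,
    # so patching argv lets a test call it directly in the same process
    testargs = ["run_translation.py", "--model_name_or_path", "sshleifer/tiny-mbart"]
    with patch.object(sys, "argv", testargs):
        main()  # imported from run_translation at the top of the file below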
+ +import math +import os +import sys +import unittest +from unittest.mock import patch + +from transformers.file_utils import is_apex_available +from transformers.integrations import is_fairscale_available +from transformers.testing_utils import ( + ExtendSysPath, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + get_torch_dist_unique_port, + require_torch_gpu, + require_torch_multi_gpu, + require_torch_non_multi_gpu, + slow, +) +from transformers.trainer_callback import TrainerState +from transformers.trainer_utils import set_seed + + +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/../../examples/pytorch/translation"): + from run_translation import main # noqa + + +set_seed(42) +MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1" +MBART_TINY = "sshleifer/tiny-mbart" + + +# a candidate for testing_utils +def require_fairscale(test_case): + """ + Decorator marking a test that requires fairscale + """ + if not is_fairscale_available(): + return unittest.skip("test requires fairscale")(test_case) + else: + return test_case + + +# a candidate for testing_utils +def require_apex(test_case): + """ + Decorator marking a test that requires apex + """ + if not is_apex_available(): + return unittest.skip("test requires apex")(test_case) + else: + return test_case + + +class TestTrainerExt(TestCasePlus): + def run_seq2seq_quick(self, distributed=False, extra_args_str=None, predict_with_generate=True): + output_dir = self.run_trainer( + eval_steps=1, + max_len=12, + model_name=MBART_TINY, + num_train_epochs=1, + distributed=distributed, + extra_args_str=extra_args_str, + predict_with_generate=predict_with_generate, + ) + logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history + eval_metrics = [log for log in logs if "eval_loss" in log.keys()] + + first_step_stats = eval_metrics[0] + if predict_with_generate: + assert "eval_bleu" in first_step_stats + + last_step_stats = eval_metrics[-1] + assert isinstance(last_step_stats["eval_bleu"], float) + assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`" + + @require_torch_non_multi_gpu + def test_run_seq2seq_no_dist(self): + self.run_seq2seq_quick() + + # verify that the trainer can handle non-distributed with n_gpu > 1 + @require_torch_multi_gpu + def test_run_seq2seq_dp(self): + self.run_seq2seq_quick(distributed=False) + + # verify that the trainer can handle distributed with n_gpu > 1 + @require_torch_multi_gpu + def test_run_seq2seq_ddp(self): + self.run_seq2seq_quick(distributed=True) + + # test --sharded_ddp w/o --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_sharded_ddp(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple") + + # test --sharded_ddp w/ --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_sharded_ddp_fp16(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16") + + # test --sharded_ddp zero_dp_2 w/o --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_fully_sharded_ddp(self): + self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False) + + # test --sharded_ddp zero_dp_2 w/ --fp16 + @require_torch_multi_gpu + @require_fairscale + def test_run_seq2seq_fully_sharded_ddp_fp16(self): + self.run_seq2seq_quick( + distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False + ) + + 
@require_apex + @require_torch_gpu + def test_run_seq2seq_apex(self): + # XXX: apex breaks the trainer if it's run twice, e.g. run_seq2seq.main() called a 2nd time from the same + # program, and it breaks other tests that run from the same pytest worker. Therefore, until this is + # sorted out, it must be run only in an external program, that is with distributed=True in this + # test and only under one or more gpus - if we want cpu we will need to make a special test + # + # as to the specific problem: it was traced to self.optimizer.step() - if it's run a 2nd time via + # a 2nd main() call it botches the future eval. + # + self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex") + # test a 2nd time - was getting `eval_loss: nan` + # to reproduce the problem set distributed=False + self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex") + + @slow + def test_run_seq2seq_slow(self): + output_dir = self.run_trainer( + eval_steps=2, + max_len=128, + model_name=MARIAN_MODEL, + learning_rate=3e-4, + num_train_epochs=10, + distributed=False, + ) + + # Check metrics + logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history + eval_metrics = [log for log in logs if "eval_loss" in log.keys()] + first_step_stats = eval_metrics[0] + last_step_stats = eval_metrics[-1] + + assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing" + assert isinstance(last_step_stats["eval_bleu"], float) + + # test if do_predict saves generations and metrics + contents = os.listdir(output_dir) + contents = {os.path.basename(p) for p in contents} + assert "generated_predictions.txt" in contents + assert "predict_results.json" in contents + + def run_trainer( + self, + eval_steps: int, + max_len: int, + model_name: str, + num_train_epochs: int, + learning_rate: float = 3e-3, + distributed: bool = False, + extra_args_str: str = None, + predict_with_generate: bool = True, + ): + data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro" + output_dir = self.get_auto_remove_tmp_dir() + args = f""" + --model_name_or_path {model_name} + --train_file {data_dir}/train.json + --validation_file {data_dir}/val.json + --test_file {data_dir}/test.json + --output_dir {output_dir} + --overwrite_output_dir + --max_train_samples 8 + --max_eval_samples 8 + --max_source_length {max_len} + --max_target_length {max_len} + --val_max_target_length {max_len} + --do_train + --do_eval + --do_predict + --num_train_epochs {str(num_train_epochs)} + --per_device_train_batch_size 4 + --per_device_eval_batch_size 4 + --learning_rate {learning_rate} + --warmup_steps 8 + --evaluation_strategy steps + --logging_steps 0 + --eval_steps {str(eval_steps)} + --save_steps {str(eval_steps)} + --group_by_length + --label_smoothing_factor 0.1 + --adafactor + --target_lang ro_RO + --source_lang en_XX + """ + if predict_with_generate: + args += "--predict_with_generate" + + args = args.split() + + if extra_args_str is not None: + args.extend(extra_args_str.split()) + + if distributed: + n_gpu = get_gpu_count() + master_port = get_torch_dist_unique_port() + distributed_args = f""" + -m torch.distributed.launch + --nproc_per_node={n_gpu} + --master_port={master_port} + {self.examples_dir_str}/pytorch/translation/run_translation.py + """.split() + cmd = [sys.executable] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + else: + testargs = ["run_translation.py"] + args + with patch.object(sys, "argv", testargs): + main() + + return output_dir
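To round off the two test modules above: in the distributed branch, `run_trainer` assembles a `torch.distributed.launch` command line and hands it to `execute_subprocess_async`, while test_deepspeed.py's `get_launcher` does the same with the `deepspeed` launcher. A minimal sketch of that command construction, with hypothetical values where the tests detect them at runtime:

    import sys

    n_gpu, master_port = 2, 29500  # normally from get_gpu_count() / get_torch_dist_unique_port()
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={n_gpu}
        --master_port={master_port}
        examples/pytorch/translation/run_translation.py
    """.split()
    cmd = [sys.executable] + distributed_args  # + the training args assembled above
    # the deepspeed-launcher equivalent used by test_deepspeed.py looks like:
    #   deepspeed --num_nodes 1 --num_gpus 2 run_translation.py ... --deepspeed ds_config_zero2.json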
diff --git a/fixtures/dummy-config.json b/fixtures/dummy-config.json new file mode 100644 index 0000000000000000000000000000000000000000..e388bdf71151db7c014ae6e0174dd07c1a6acbee --- /dev/null +++ b/fixtures/dummy-config.json @@ -0,0 +1,3 @@ +{ + "model_type": "roberta" +} \ No newline at end of file diff --git a/fixtures/dummy_feature_extractor_config.json b/fixtures/dummy_feature_extractor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cf0c5dce6c42b8a188462b362cbaaa0b3d76e01e --- /dev/null +++ b/fixtures/dummy_feature_extractor_config.json @@ -0,0 +1,3 @@ +{ + "feature_extractor_type": "Wav2Vec2FeatureExtractor" +} \ No newline at end of file diff --git a/fixtures/empty.txt b/fixtures/empty.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fixtures/input.txt b/fixtures/input.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1e3f410d07833e4c5c233ffd54f8d2b54ebb7cf --- /dev/null +++ b/fixtures/input.txt @@ -0,0 +1 @@ +Who was Jim Henson ? ||| Jim Henson was a puppeteer diff --git a/fixtures/preprocessor_config.json b/fixtures/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cf0c5dce6c42b8a188462b362cbaaa0b3d76e01e --- /dev/null +++ b/fixtures/preprocessor_config.json @@ -0,0 +1,3 @@ +{ + "feature_extractor_type": "Wav2Vec2FeatureExtractor" +} \ No newline at end of file diff --git a/fixtures/sample_text.txt b/fixtures/sample_text.txt new file mode 100644 index 0000000000000000000000000000000000000000..a42812060c576bae870eb29b1ac083fda0d239d3 --- /dev/null +++ b/fixtures/sample_text.txt @@ -0,0 +1,33 @@ +This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত +Text should be one-sentence-per-line, with empty lines between documents. +This sample text is public domain and was randomly selected from Project Guttenberg. + +The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. +Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. +Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. +"Cass" Beard had risen early that morning, but not with a view to discovery. +A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. +The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. +This was nearly opposite. +Mr. Cassius crossed the highway, and stopped suddenly. +Something glittered in the nearest red pool before him. +Gold, surely! +But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. +Looking at it more attentively, he saw that it bore the inscription, "May to Cass." 
+Like most of his fellow gold-seekers, Cass was superstitious. + +The fountain of classic wisdom, Hypatia herself. +As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. +From my youth I felt in me a soul above the matter-entangled herd. +She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. +His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. +At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/fixtures/sample_text_no_unicode.txt b/fixtures/sample_text_no_unicode.txt new file mode 100644 index 0000000000000000000000000000000000000000..74646661c7c121a31af30def84a4d724a4b2d41f --- /dev/null +++ b/fixtures/sample_text_no_unicode.txt @@ -0,0 +1,32 @@ +Text should be one-sentence-per-line, with empty lines between documents. +This sample text is public domain and was randomly selected from Project Guttenberg. + +The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 
+Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. +Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. +"Cass" Beard had risen early that morning, but not with a view to discovery. +A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. +The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. +This was nearly opposite. +Mr. Cassius crossed the highway, and stopped suddenly. +Something glittered in the nearest red pool before him. +Gold, surely! +But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. +Looking at it more attentively, he saw that it bore the inscription, "May to Cass." +Like most of his fellow gold-seekers, Cass was superstitious. + +The fountain of classic wisdom, Hypatia herself. +As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. +From my youth I felt in me a soul above the matter-entangled herd. +She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. +His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. 
+At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/fixtures/spiece.model b/fixtures/spiece.model new file mode 100644 index 0000000000000000000000000000000000000000..c91b8acfa56ccfc80e1cdd854ddcaf9b6c44ab2a Binary files /dev/null and b/fixtures/spiece.model differ diff --git a/fixtures/test_sentencepiece.model b/fixtures/test_sentencepiece.model new file mode 100644 index 0000000000000000000000000000000000000000..376dda73010c6f93acfa3b974bea81a9ac9e1740 Binary files /dev/null and b/fixtures/test_sentencepiece.model differ diff --git a/fixtures/test_sentencepiece_bpe.model b/fixtures/test_sentencepiece_bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..a75dee72cb00ae836bcc4fad45ce26f42f9f2c67 Binary files /dev/null and b/fixtures/test_sentencepiece_bpe.model differ diff --git a/fixtures/test_sentencepiece_no_bos.model b/fixtures/test_sentencepiece_no_bos.model new file mode 100644 index 0000000000000000000000000000000000000000..c3336ae60c71d2545bb0c0e7e8304d9cc1e9479c Binary files /dev/null and b/fixtures/test_sentencepiece_no_bos.model differ diff --git a/fixtures/tests_samples/.gitignore b/fixtures/tests_samples/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1d7141c43dcf8f422e18fe3c13eb8f9e0bb9a964 --- /dev/null +++ b/fixtures/tests_samples/.gitignore @@ -0,0 +1,6 @@ +cache* +temp* +!*.txt +!*.tsv +!*.json +!.gitignore \ No newline at end of file diff --git a/fixtures/tests_samples/COCO/000000039769.png b/fixtures/tests_samples/COCO/000000039769.png new file mode 100644 index 0000000000000000000000000000000000000000..a3b5225fc3cef5c492cc109aebe883f24941a156 Binary files /dev/null and b/fixtures/tests_samples/COCO/000000039769.png differ diff --git a/fixtures/tests_samples/COCO/coco_annotations.txt b/fixtures/tests_samples/COCO/coco_annotations.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd8c86a9bc3cbbc2f12e5efc58a805c7e9346d37 --- /dev/null +++ b/fixtures/tests_samples/COCO/coco_annotations.txt @@ -0,0 +1 @@ +[{"segmentation": [[333.96, 175.14, 338.26, 134.33, 342.55, 95.67, 348.99, 79.57, 368.32, 80.64, 371.54, 91.38, 364.03, 106.41, 356.51, 145.07, 351.14, 166.55, 350.07, 184.8, 345.77, 185.88, 332.89, 178.36, 332.89, 172.99]], "area": 2120.991099999999, "iscrowd": 0, "image_id": 39769, "bbox": [332.89, 79.57, 38.65, 106.31], "category_id": 75, "id": 1108446}, {"segmentation": [[44.03, 86.01, 112.75, 74.2, 173.96, 77.42, 175.03, 89.23, 170.74, 98.9, 147.11, 102.12, 54.77, 119.3, 53.69, 119.3, 44.03, 113.93, 41.88, 94.6, 41.88, 94.6]], "area": 4052.607, "iscrowd": 0, "image_id": 39769, "bbox": [41.88, 74.2, 133.15, 45.1], "category_id": 75, "id": 1110067}, {"segmentation": [[1.08, 473.53, 633.17, 473.53, 557.66, 376.45, 535.01, 366.74, 489.71, 305.26, 470.29, 318.2, 456.27, 351.64, 413.12, 363.51, 376.45, 358.11, 348.4, 350.56, 363.51, 331.15, 357.03, 288.0, 353.8, 257.8, 344.09, 190.92, 333.3, 177.98, 345.17, 79.82, 284.76, 130.52, 265.35, 151.01, 308.49, 189.84, 317.12, 215.73, 293.39, 243.78, 269.66, 212.49, 235.15, 199.55, 214.65, 193.08, 187.69, 217.89, 159.64, 278.29, 135.91, 313.89, 169.35, 292.31, 203.87, 281.53, 220.04, 292.31, 220.04, 307.42, 175.82, 345.17, 155.33, 360.27, 105.71, 
363.51, 85.21, 374.29, 74.43, 366.74, 70.11, 465.98, 42.07, 471.37, 33.44, 457.35, 34.52, 414.2, 29.12, 368.9, 9.71, 291.24, 46.38, 209.26, 99.24, 128.36, 131.6, 107.87, 50.7, 117.57, 40.99, 103.55, 40.99, 85.21, 60.4, 77.66, 141.3, 70.11, 173.66, 72.27, 174.74, 92.76, 204.94, 72.27, 225.44, 62.56, 262.11, 56.09, 292.31, 53.93, 282.61, 81.98, 298.79, 96.0, 310.65, 102.47, 348.4, 74.43, 373.21, 81.98, 430.38, 35.6, 484.31, 23.73, 540.4, 46.38, 593.26, 66.88, 638.56, 80.9, 632.09, 145.62, 581.39, 118.65, 543.64, 130.52, 533.93, 167.19, 512.36, 197.39, 498.34, 218.97, 529.62, 253.48, 549.03, 273.98, 584.63, 276.13, 587.87, 293.39, 566.29, 305.26, 531.78, 298.79, 549.03, 319.28, 576.0, 358.11, 560.9, 376.45, 639.64, 471.37, 639.64, 2.16, 1.08, 0.0]], "area": 176277.55269999994, "iscrowd": 0, "image_id": 39769, "bbox": [1.08, 0.0, 638.56, 473.53], "category_id": 63, "id": 1605237}, {"segmentation": [[1.07, 1.18, 640.0, 3.33, 638.93, 472.59, 4.3, 479.03]], "area": 301552.6694999999, "iscrowd": 0, "image_id": 39769, "bbox": [1.07, 1.18, 638.93, 477.85], "category_id": 65, "id": 1612051}, {"segmentation": [[138.75, 319.38, 148.75, 294.38, 165.0, 246.87, 197.5, 205.63, 247.5, 203.13, 268.75, 216.88, 280.0, 239.38, 293.75, 244.38, 303.75, 241.88, 307.5, 228.13, 318.75, 220.63, 315.0, 200.63, 291.25, 171.88, 265.0, 156.88, 258.75, 148.13, 262.5, 135.63, 282.5, 123.13, 292.5, 115.63, 311.25, 108.13, 313.75, 106.88, 296.25, 93.13, 282.5, 84.38, 292.5, 64.38, 288.75, 60.63, 266.25, 54.38, 232.5, 63.12, 206.25, 70.63, 170.0, 100.63, 136.25, 114.38, 101.25, 138.13, 56.25, 194.38, 27.5, 259.38, 17.5, 299.38, 32.5, 378.13, 31.25, 448.13, 41.25, 469.38, 66.25, 466.88, 70.0, 419.38, 71.25, 391.88, 77.5, 365.63, 113.75, 364.38, 145.0, 360.63, 168.75, 349.38, 191.25, 330.63, 212.5, 319.38, 223.75, 305.63, 206.25, 286.88, 172.5, 288.13]], "area": 53301.618749999994, "iscrowd": 0, "image_id": 39769, "bbox": [17.5, 54.38, 301.25, 415.0], "category_id": 17, "id": 2190839}, {"segmentation": [[543.75, 136.88, 570.0, 114.38, 591.25, 123.13, 616.25, 140.63, 640.0, 143.13, 636.25, 124.37, 605.0, 103.13, 640.0, 103.13, 633.75, 86.88, 587.5, 73.13, 548.75, 49.38, 505.0, 35.63, 462.5, 25.63, 405.0, 48.13, 362.5, 111.88, 347.5, 179.38, 355.0, 220.63, 356.25, 230.63, 365.0, 264.38, 358.75, 266.88, 358.75, 270.63, 356.25, 291.88, 356.25, 325.63, 355.0, 338.13, 350.0, 348.13, 365.0, 354.38, 396.25, 351.88, 423.75, 355.63, 446.25, 350.63, 460.0, 345.63, 462.5, 321.88, 468.75, 306.88, 481.25, 299.38, 516.25, 341.88, 536.25, 368.13, 570.0, 369.38, 578.75, 359.38, 555.0, 330.63, 532.5, 298.13, 563.75, 299.38, 582.5, 298.13, 586.25, 286.88, 578.75, 278.13, 548.75, 269.38, 525.0, 256.88, 505.0, 206.88, 536.25, 161.88, 540.0, 149.38]], "area": 59700.95625, "iscrowd": 0, "image_id": 39769, "bbox": [347.5, 25.63, 292.5, 343.75], "category_id": 17, "id": 2190842}] \ No newline at end of file diff --git a/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png b/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png new file mode 100644 index 0000000000000000000000000000000000000000..9dc23525d6ead4c25118ed9fb6c3dae5a8f76ecc Binary files /dev/null and b/fixtures/tests_samples/COCO/coco_panoptic/000000039769.png differ diff --git a/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt b/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt new file mode 100644 index 0000000000000000000000000000000000000000..90a9798be2a2abbc359b698799795c918d4a787b --- /dev/null +++ b/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt 
@@ -0,0 +1 @@ +[{"id": 8222595, "category_id": 17, "iscrowd": 0, "bbox": [18, 54, 301, 415], "area": 53306}, {"id": 8225432, "category_id": 17, "iscrowd": 0, "bbox": [349, 26, 291, 343], "area": 59627}, {"id": 8798150, "category_id": 63, "iscrowd": 0, "bbox": [1, 0, 639, 474], "area": 174579}, {"id": 14466198, "category_id": 75, "iscrowd": 0, "bbox": [42, 74, 133, 45], "area": 4068}, {"id": 12821912, "category_id": 75, "iscrowd": 0, "bbox": [333, 80, 38, 106], "area": 2118}, {"id": 10898909, "category_id": 93, "iscrowd": 0, "bbox": [0, 0, 640, 480], "area": 2750}] \ No newline at end of file diff --git a/fixtures/tests_samples/GermEval/dev.txt b/fixtures/tests_samples/GermEval/dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..de001582302780954c8f00af5531372df290a43b --- /dev/null +++ b/fixtures/tests_samples/GermEval/dev.txt @@ -0,0 +1,202 @@ +Gleich O +darauf O +entwirft O +er O +seine O +Selbstdarstellung O +" O +Ecce B-OTH +homo I-OTH +" O +in O +enger O +Auseinandersetzung O +mit O +diesem O +Bild O +Jesu B-PER +. O + +1980 O +kam O +der O +Crown B-OTH +als O +Versuch O +von O +Toyota B-ORG +, O +sich O +in O +der O +Oberen O +Mittelklasse O +zu O +etablieren O +, O +auch O +nach O +Deutschland B-LOC +. O + +– O +4:26 O +# O +Sometime B-OTH +Ago/La I-OTH +Fiesta I-OTH +– O +23:18 O +Alle O +Stücke O +wurden O +von O +Corea B-PER +komponiert O +mit O +Ausnahme O +der O +einleitenden O +Improvisation O +zu O +Sometime B-OTH +Ago I-OTH +. O + +Bis O +2013 O +steigen O +die O +Mittel O +aus O +dem O +EU-Budget B-ORGpart +auf O +rund O +120 O +Millionen O +Euro B-OTH +. O + +Daraus O +entwickelte O +sich O +im O +Rokoko B-OTH +die O +Sitte O +des O +gemeinsamen O +Weinens O +im O +Theater O +, O +das O +die O +Standesgrenzen O +innerhalb O +des O +Publikums O +überbrücken O +sollte O +. O + +Die O +Spinne O +hatte O +sie O +mit O +Seidenfäden O +an O +ihrem O +Schwanz O +gefesselt O +und O +nach O +oben O +gezogen O +. O + +In O +Deutschland B-LOC +ist O +nach O +StGB O +eine O +Anwerbung O +für O +die O +Fremdenlegion O +strafbar O +. O + +Am O +Donnerstag O +wird O +sich O +zeigen O +, O +ob O +die O +Idee O +der O +DLR-Forscher B-ORGpart +funktioniert O +. O + +Der O +sechste O +Lauf O +der O +ADAC B-ORG +GT I-ORG +Masters I-ORG +stand O +ganz O +klar O +im O +Mittelpunkt O +des O +Motorsport-Wochenendes O +auf O +dem O +Eurospeedway B-ORG +Lausitz I-ORG +. O + +Nach O +den O +schwächeren O +Vorgaben O +der O +Wall B-ORG +Street I-ORG +vom O +Vortag O +setzten O +die O +deutschen B-LOCderiv +Standardwerte O +ihren O +Konsolidierungskurs O +fort O +. O + +Kolb B-PER +war O +seit O +1986 O +im O +Turnverein O +als O +Leiter O +tätig O +, O +darunter O +elf O +Jahre O +als O +Hauptleiter O +in O +der O +Männerriege O +.
O diff --git a/fixtures/tests_samples/GermEval/labels.txt b/fixtures/tests_samples/GermEval/labels.txt new file mode 100644 index 0000000000000000000000000000000000000000..a781cbd47ee29d10dc1b8cf823c4ec9600ba0355 --- /dev/null +++ b/fixtures/tests_samples/GermEval/labels.txt @@ -0,0 +1,25 @@ +B-LOC +B-LOCderiv +B-LOCpart +B-ORG +B-ORGderiv +B-ORGpart +B-OTH +B-OTHderiv +B-OTHpart +B-PER +B-PERderiv +B-PERpart +I-LOC +I-LOCderiv +I-LOCpart +I-ORG +I-ORGderiv +I-ORGpart +I-OTH +I-OTHderiv +I-OTHpart +I-PER +I-PERderiv +I-PERpart +O diff --git a/fixtures/tests_samples/GermEval/train.txt b/fixtures/tests_samples/GermEval/train.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d613ae1ee9b07901f14bf9107b042fa071c3525 --- /dev/null +++ b/fixtures/tests_samples/GermEval/train.txt @@ -0,0 +1,200 @@ +Schartau B-PER +sagte O +dem O +" O +Tagesspiegel B-ORG +" O +vom O +Freitag O +, O +Fischer B-PER +sei O +" O +in O +einer O +Weise O +aufgetreten O +, O +die O +alles O +andere O +als O +überzeugend O +war O +" O +. O + +Firmengründer O +Wolf B-PER +Peter I-PER +Bree I-PER +arbeitete O +Anfang O +der O +siebziger O +Jahre O +als O +Möbelvertreter O +, O +als O +er O +einen O +fliegenden O +Händler O +aus O +dem O +Libanon B-LOC +traf O +. O + +Ob O +sie O +dabei O +nach O +dem O +Runden O +Tisch O +am O +23. O +April O +in O +Berlin B-LOC +durch O +ein O +pädagogisches O +Konzept O +unterstützt O +wird O +, O +ist O +allerdings O +zu O +bezweifeln O +. O + +Bayern B-ORG +München I-ORG +ist O +wieder O +alleiniger O +Top- O +Favorit O +auf O +den O +Gewinn O +der O +deutschen B-LOCderiv +Fußball-Meisterschaft O +. O + +Dabei O +hätte O +der O +tapfere O +Schlussmann O +allen O +Grund O +gehabt O +, O +sich O +viel O +früher O +aufzuregen O +. O + +ARD-Programmchef B-ORGpart +Günter B-PER +Struve I-PER +war O +wegen O +eines O +vierwöchigen O +Urlaubs O +für O +eine O +Stellungnahme O +nicht O +erreichbar O +. O + +Alternativ O +sollten O +sich O +die O +Restaurantbetreiber O +aus O +Sicht O +der O +Solingerin B-LOCderiv +zu O +längeren O +Öffnungszeiten O +verpflichten O +, O +um O +wartende O +Kunden O +aufzunehmen O +. O + +Die O +Deutsche B-ORG +Flugsicherung I-ORG +( O +DFS B-ORG +) O +beschloss O +ein O +Flugverbot O +für O +alle O +internationalen O +Flughäfen O +mit O +Ausnahme O +der O +beiden O +Berliner B-LOCderiv +Flughäfen O +bis O +2.00 O +Uhr O +nachts O +. O + +New O +Small O +Family O +mit O +E-Motor O +: O +Studie O +E-Up O +! O + +Eine O +Schwachstelle O +war O +beispielsweise O +der O +Spiegelkasten O +. O + +Denn O +durch O +den O +Einsatz O +moderner O +Fahrzeugtechnik O +( O +Dieseltriebwagen O +) O +und O +schalldämmender O +Fenster O +entsteht O +keine O +Einschränkung O +der O +Wohnqualität O +. O diff --git a/fixtures/tests_samples/MRPC/dev.csv b/fixtures/tests_samples/MRPC/dev.csv new file mode 100644 index 0000000000000000000000000000000000000000..96beccda96d7e164e4484e037a52fb338cc22180 --- /dev/null +++ b/fixtures/tests_samples/MRPC/dev.csv @@ -0,0 +1,7 @@ +label,sentence1,sentence2 +equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ." +not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ." 
+not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ." +equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . +not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ." +equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . diff --git a/fixtures/tests_samples/MRPC/dev.tsv b/fixtures/tests_samples/MRPC/dev.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5b814856c63f44ef8c082726ae19285a4faec26c --- /dev/null +++ b/fixtures/tests_samples/MRPC/dev.tsv @@ -0,0 +1,7 @@ +Quality #1 ID #2 ID #1 String #2 String +1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . +0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . +0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . +1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . +0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . +1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . diff --git a/fixtures/tests_samples/MRPC/train.csv b/fixtures/tests_samples/MRPC/train.csv new file mode 100644 index 0000000000000000000000000000000000000000..96beccda96d7e164e4484e037a52fb338cc22180 --- /dev/null +++ b/fixtures/tests_samples/MRPC/train.csv @@ -0,0 +1,7 @@ +label,sentence1,sentence2 +equivalent,He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .,""" The foodservice pie business does not fit our long-term growth strategy ." +not_equivalent,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war ." 
+not_equivalent,"The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .","The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent ." +equivalent,The AFL-CIO is waiting until October to decide if it will endorse a candidate .,The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . +not_equivalent,No dates have been set for the civil or the criminal trial .,"No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty ." +equivalent,Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .,It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . diff --git a/fixtures/tests_samples/MRPC/train.tsv b/fixtures/tests_samples/MRPC/train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..5b814856c63f44ef8c082726ae19285a4faec26c --- /dev/null +++ b/fixtures/tests_samples/MRPC/train.tsv @@ -0,0 +1,7 @@ +Quality #1 ID #2 ID #1 String #2 String +1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . +0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . +0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . +1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . +0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . +1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . diff --git a/fixtures/tests_samples/SQUAD/sample.json b/fixtures/tests_samples/SQUAD/sample.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3dcc27d721f4a09ac3f23fee07f6e64441535d --- /dev/null +++ b/fixtures/tests_samples/SQUAD/sample.json @@ -0,0 +1,201 @@ +{ + "version": 2.0, + "data": [ + { + "id": "56ddde6b9a695914005b9628", + "question": "In what country is Normandy located?", + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. 
Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", + "answers": { + "answer_start": [ + 159, + 159, + 159, + 159 + ], + "text": [ + "France", + "France", + "France", + "France" + ] + } + }, + { + "id": "56ddde6b9a695914005b9629", + "question": "When were the Normans in Normandy?", + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", + "answers": { + "answer_start": [ + 94, + 87, + 94, + 94 + ], + "text": [ + "10th and 11th centuries", + "in the 10th and 11th centuries", + "10th and 11th centuries", + "10th and 11th centuries" + ] + } + }, + { + "id": "56ddde6b9a695914005b962a", + "question": "From which countries did the Norse originate?", + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", + "answers": { + "answer_start": [ + 256, + 256, + 256, + 256 + ], + "text": [ + "Denmark, Iceland and Norway", + "Denmark, Iceland and Norway", + "Denmark, Iceland and Norway", + "Denmark, Iceland and Norway" + ] + } + }, + { + "id": "5ad39d53604f3c001a3fe8d3", + "question": "Who did King Charles III swear fealty to?", + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. 
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", + "answers": { + "answer_start": [], + "text": [] + } + }, + { + "id": "5ad39d53604f3c001a3fe8d4", + "question": "When did the Frankish identity emerge?", + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", + "answers": { + "answer_start": [], + "text": [] + } + }, + { + "id": "56dddf4066d3e219004dad5f", + "question": "Who was the duke in the battle of Hastings?", + "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.", + "answers": { + "answer_start": [ + 1022, + 1022, + 1022 + ], + "text": [ + "William the Conqueror", + "William the Conqueror", + "William the Conqueror" + ] + } + }, + { + "id": "5ad3a266604f3c001a3fea2b", + "question": "What principality did William the conquerer found?", + "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. 
The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.", + "answers": { + "answer_start": [], + "text": [] + } + }, + { + "id": "56e16182e3433e1400422e28", + "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?", + "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm.", + "answers": { + "answer_start": [ + 0, + 0, + 0 + ], + "text": [ + "Computational complexity theory", + "Computational complexity theory", + "Computational complexity theory" + ] + } + }, + { + "id": "5ad5316b5b96ef001a10ab76", + "question": "What is a manual application of mathematical steps?", + "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm.", + "answers": { + "answer_start": [], + "text": [] + } + }, + { + "id": "56e16839cd28a01900c67887", + "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?", + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). 
One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.", + "answers": { + "answer_start": [ + 46, + 49, + 46 + ], + "text": [ + "if its solution requires significant resources", + "its solution requires significant resources", + "if its solution requires significant resources" + ] + } + }, + { + "id": "56e16839cd28a01900c67888", + "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?", + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.", + "answers": { + "answer_start": [ + 176, + 176, + 176 + ], + "text": [ + "mathematical models of computation", + "mathematical models of computation", + "mathematical models of computation" + ] + } + }, + { + "id": "56e16839cd28a01900c67889", + "question": "What are two basic primary resources used to guage complexity?", + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.", + "answers": { + "answer_start": [ + 305, + 305, + 305 + ], + "text": [ + "time and storage", + "time and storage", + "time and storage" + ] + } + }, + { + "id": "5ad532575b96ef001a10ab7f", + "question": "What unit is measured to determine circuit simplicity?", + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.", + "answers": { + "answer_start": [], + "text": [] + } + }, + { + "id": "5ad532575b96ef001a10ab80", + "question": "What number is used in perpendicular computing?", + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. 
The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do.", + "answers": { + "answer_start": [], + "text": [] + } + } + ] +} diff --git a/fixtures/tests_samples/STS-B/dev.tsv b/fixtures/tests_samples/STS-B/dev.tsv new file mode 100644 index 0000000000000000000000000000000000000000..8d689c2ccc67dc4cd533562da00e1731f80902f3 --- /dev/null +++ b/fixtures/tests_samples/STS-B/dev.tsv @@ -0,0 +1,10 @@ +index genre filename year old_index source1 source2 sentence1 sentence2 score +0 main-captions MSRvid 2012test 0000 none none A man with a hard hat is dancing. A man wearing a hard hat is dancing. 5.000 +1 main-captions MSRvid 2012test 0002 none none A young child is riding a horse. A child is riding a horse. 4.750 +2 main-captions MSRvid 2012test 0003 none none A man is feeding a mouse to a snake. The man is feeding a mouse to the snake. 5.000 +3 main-captions MSRvid 2012test 0007 none none A woman is playing the guitar. A man is playing guitar. 2.400 +4 main-captions MSRvid 2012test 0008 none none A woman is playing the flute. A man is playing a flute. 2.750 +5 main-captions MSRvid 2012test 0010 none none A woman is cutting an onion. A man is cutting onions. 2.615 +6 main-captions MSRvid 2012test 0015 none none A man is erasing a chalk board. The man is erasing the chalk board. 5.000 +7 main-captions MSRvid 2012test 0023 none none A woman is carrying a boy. A woman is carrying her baby. 2.333 +8 main-captions MSRvid 2012test 0027 none none Three men are playing guitars. Three men are on stage playing guitars. 3.750 diff --git a/fixtures/tests_samples/STS-B/train.tsv b/fixtures/tests_samples/STS-B/train.tsv new file mode 100644 index 0000000000000000000000000000000000000000..a38be956d6020fff987ad2cd73bc576f2986d36b --- /dev/null +++ b/fixtures/tests_samples/STS-B/train.tsv @@ -0,0 +1,10 @@ +index genre filename year old_index source1 source2 sentence1 sentence2 score +0 main-captions MSRvid 2012test 0001 none none A plane is taking off. An air plane is taking off. 5.000 +1 main-captions MSRvid 2012test 0004 none none A man is playing a large flute. A man is playing a flute. 3.800 +2 main-captions MSRvid 2012test 0005 none none A man is spreading shreded cheese on a pizza. A man is spreading shredded cheese on an uncooked pizza. 3.800 +3 main-captions MSRvid 2012test 0006 none none Three men are playing chess. Two men are playing chess. 2.600 +4 main-captions MSRvid 2012test 0009 none none A man is playing the cello. A man seated is playing the cello. 4.250 +5 main-captions MSRvid 2012test 0011 none none Some men are fighting. Two men are fighting. 4.250 +6 main-captions MSRvid 2012test 0012 none none A man is smoking. A man is skating. 0.500 +7 main-captions MSRvid 2012test 0013 none none The man is playing the piano. The man is playing the guitar. 1.600 +8 main-captions MSRvid 2012test 0014 none none A man is playing on a guitar and singing. A woman is playing an acoustic guitar and singing. 
2.200 diff --git a/fixtures/tests_samples/conll/sample.json b/fixtures/tests_samples/conll/sample.json new file mode 100644 index 0000000000000000000000000000000000000000..0bc42a92fe8c934850df8967a293eb8df7cd3c88 --- /dev/null +++ b/fixtures/tests_samples/conll/sample.json @@ -0,0 +1,10 @@ +{"words": ["He", "was", "the", "27th", "pitcher", "used", "by", "the", "Angels", "this", "season", ",", "tying", "a", "major-league", "record", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O", "O", "O", "O"]} +{"words": ["CHICAGO", "AT", "ATLANTA"], "ner": ["B-ORG", "O", "B-LOC"]} +{"words": ["President", "Bill", "Clinton", "earlier", "this", "month", "invoked", "special", "powers", "to", "appoint", "Fowler", "during", "the", "congressional", "recess", "because", "the", "Senate", "delayed", "confirming", "his", "nomination", "."], "ner": ["O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O", "O", "O", "O"]} +{"words": ["goals", "for", ",", "goals", "against", ",", "points", ")", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]} +{"words": ["\"", "It", "is", "one", "step", "short", "of", "an", "emergency", "situation", ",", "\"", "a", "police", "spokesman", "said", "via", "telephone", "from", "a", "command", "post", "in", "the", "bush", "."], "ner": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]} +{"words": ["U.S.", "Ambassador", "Myles", "Frechette", "applauded", "the", "move", ",", "saying", "it", "could", "prompt", "the", "Clinton", "administration", "to", "remove", "Colombia", "from", "a", "list", "of", "outcast", "nations", "that", "have", "failed", "to", "cooperate", "in", "U.S.", "counternarcotics", "efforts", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O"]} +{"words": ["Halftime"], "ner": ["O"]} +{"words": ["It", "has", "manufacturing", "plants", "in", "San", "Diego", ";", "Creedmoor", ",", "N.C.", ";", "Hampshire", ",", "England", ";", "and", "Tijuana", ",", "Mexico", ",", "and", "distributes", "its", "prodcuts", "in", "more", "than", "120", "countries", "."], "ner": ["O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]} +{"words": ["Scotland", "manager", "Craig", "Brown", "said", "on", "Thursday", ":", "\"", "I", "'ve", "watched", "Duncan", "Ferguson", "in", "action", "twice", "recently", "and", "he", "'s", "bang", "in", "form", "."], "ner": ["B-LOC", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]} +{"words": ["Clinton", "flew", "in", "by", "helicopter", "from", "Michigan", "City", ",", "Indiana", ",", "after", "ending", "a", "four-day", ",", "559-mile", "trip", "aboard", "a", "campaign", "train", "from", "Washington", "."], "ner": ["B-PER", "O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O"]} \ No newline at end of file diff --git a/fixtures/tests_samples/swag/sample.json b/fixtures/tests_samples/swag/sample.json new file mode 100644 index 0000000000000000000000000000000000000000..d00ad8d184e380570c05836c3c1e167f46256cbb --- /dev/null +++ 
b/fixtures/tests_samples/swag/sample.json @@ -0,0 +1,10 @@ +{"ending0": "passes by walking down the street playing their instruments.", "ending1": "has heard approaching them.", "ending2": "arrives and they're outside dancing and asleep.", "ending3": "turns the lead singer watches the performance.", "label": 0, "sent1": "Members of the procession walk down the street holding small horn brass instruments.", "sent2": "A drum line"} +{"ending0": "are playing ping pong and celebrating one left each in quick.", "ending1": "wait slowly towards the cadets.", "ending2": "continues to play as well along the crowd along with the band being interviewed.", "ending3": "continue to play marching, interspersed.", "label": 3, "sent1": "A drum line passes by walking down the street playing their instruments.", "sent2": "Members of the procession"} +{"ending0": "pay the other coaches to cheer as people this chatter dips in lawn sheets.", "ending1": "walk down the street holding small horn brass instruments.", "ending2": "is seen in the background.", "ending3": "are talking a couple of people playing a game of tug of war.", "label": 1, "sent1": "A group of members in green uniforms walks waving flags.", "sent2": "Members of the procession"} +{"ending0": "are playing ping pong and celebrating one left each in quick.", "ending1": "wait slowly towards the cadets.", "ending2": "makes a square call and ends by jumping down into snowy streets where fans begin to take their positions.", "ending3": "play and go back and forth hitting the drums while the audience claps for them.", "label": 3, "sent1": "A drum line passes by walking down the street playing their instruments.", "sent2": "Members of the procession"} +{"ending0": "finishes the song and lowers the instrument.", "ending1": "hits the saxophone and demonstrates how to properly use the racquet.", "ending2": "finishes massage the instrument again and continues.", "ending3": "continues dancing while the man gore the music outside while drums.", "label": 0, "sent1": "The person plays a song on the violin.", "sent2": "The man"} +{"ending0": "finishes playing then marches their tenderly.", "ending1": "walks in frame and rubs on his hands, and then walks into a room.", "ending2": "continues playing guitar while moving from the camera.", "ending3": "plays a song on the violin.", "label": 3, "sent1": "The person holds up the violin to his chin and gets ready.", "sent2": "The person"} +{"ending0": "examines the instrument in his hand.", "ending1": "stops playing the drums and waves over the other boys.", "ending2": "lights the cigarette and sticks his head in.", "ending3": "drags off the vacuum.", "label": 0, "sent1": "A person retrieves an instrument from a closet.", "sent2": "The man"} +{"ending0": "studies a picture of the man playing the violin.", "ending1": "holds up the violin to his chin and gets ready.", "ending2": "stops to speak to the camera again.", "ending3": "puts his arm around the man and backs away.", "label": 1, "sent1": "The man examines the instrument in his hand.", "sent2": "The person"} +{"ending0": "hands her another phone.", "ending1": "takes the drink, then holds it.", "ending2": "looks off then looks at someone.", "ending3": "stares blearily down at the floor.", "label": 3, "sent1": "Someone walks over to the radio.", "sent2": "Someone"} +{"ending0": "looks off then looks at someone.", "ending1": "hands her another phone.", "ending2": "takes the drink, then holds it.", "ending3": "turns on a monitor.", "label": 3, "sent1": "Someone walks over 
to the radio.", "sent2": "Someone"} diff --git a/fixtures/tests_samples/wiki_text/wiki_00 b/fixtures/tests_samples/wiki_text/wiki_00 new file mode 100644 index 0000000000000000000000000000000000000000..773074910b487eed863883642ea192b972bfc84b --- /dev/null +++ b/fixtures/tests_samples/wiki_text/wiki_00 @@ -0,0 +1,251 @@ + +Anarchism + +Anarchism is a political philosophy and movement that rejects all involuntary, coercive forms of hierarchy. It radically calls for the abolition of the state which it holds to be undesirable, unnecessary, and harmful. + +The history of anarchism stretches back to prehistory, when humans lived in anarchistic societies long before the establishment of formal states, realms or empires. With the rise of organised hierarchical bodies, skepticism toward authority also rose, but it was not until the 19th century that a self-conscious political movement emerged. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in worker's struggles for emancipation. Various anarchist schools of thought formed during this period. + +Anarchists took part in several revolutions, most notably in the Spanish Civil War, where they were crushed along with the alliance to restore the Second Republic by the fascist forces of the Nationalist faction and its foreign allies in Nazi Germany, Fascist Italy, Portuguese Dictatorship and the Catholic Church in 1939, marking the end of the classical era of anarchism. In the last decades of the 20th century and into the 21st century, the anarchist movement has been resurgent once more. + +Anarchism employs various tactics in order to meet its ideal ends; these can be broadly separated into revolutionary and evolutionary tactics. There is significant overlap between the two, which are merely descriptive. Revolutionary tactics aim to bring down authority and state, and have taken a violent turn in the past. Evolutionary tactics aim to prefigure what an anarchist society would be like. Anarchist thought, criticism, and praxis has played a part in diverse areas of human society. + +The etymological origin of "anarchism" is from the Ancient Greek "anarkhia", meaning "without a ruler", composed of the prefix "an-" (i.e. "without") and the word "arkhos" (i.e. "leader" or "ruler"). The suffix "-ism" denotes the ideological current that favours anarchy. "Anarchism" appears in English from 1642 as "anarchisme" and "anarchy" from 1539. Various factions within the French Revolution labelled their opponents as "anarchists", although few such accused shared many views with later anarchists. Many revolutionaries of the 19th century such as William Godwin (1756–1836) and Wilhelm Weitling (1808–1871) would contribute to the anarchist doctrines of the next generation, but they did not use "anarchist" or "anarchism" in describing themselves or their beliefs. + +The first political philosopher to call himself an "anarchist" () was Pierre-Joseph Proudhon (1809–1865), marking the formal birth of anarchism in the mid-19th century. Since the 1890s and beginning in France, "libertarianism" has often been used as a synonym for anarchism and its use as a synonym is still common outside the United States. On the other hand, some use "libertarianism" to refer to individualistic free-market philosophy only, referring to free-market anarchism as "libertarian anarchism". 
+ +While opposition to the state is central to anarchist thought, defining anarchism is not an easy task, as there is a lot of discussion among scholars and anarchists on the matter and various currents perceive anarchism slightly differently. Hence, it might be true to say that anarchism is a cluster of political philosophies opposing authority and hierarchical organization (including the state, capitalism, nationalism and all associated institutions) in the conduct of all human relations in favour of a society based on voluntary association, on freedom and on decentralisation, but this definition has the same shortcomings as the definition based on etymology (which is simply a negation of a ruler), or based on anti-statism (anarchism is much more than that) or even the anti-authoritarian (which is an "a posteriori" conclusion). Nonetheless, major elements of the definition of anarchism include the following: + +During the prehistoric era of mankind, an established authority did not exist. It was after the creation of towns and cities that institutions of authority were established and anarchistic ideas espoused as a reaction. The most notable precursors to anarchism in the ancient world were in China and Greece. In China, philosophical anarchism (i.e. the discussion on the legitimacy of the state) was delineated by Taoist philosophers Zhuang Zhou and Laozi. + +Likewise, anarchic attitudes were articulated by tragedians and philosophers in Greece. Aeschylus and Sophocles used the myth of Antigone to illustrate the conflict between rules set by the state and personal autonomy. Socrates questioned Athenian authorities constantly and insisted on the right of individual freedom of conscience. Cynics dismissed human law ("nomos") and associated authorities while trying to live according to nature ("physis"). Stoics were supportive of a society based on unofficial and friendly relations among its citizens without the presence of a state. + +During the Middle Ages, there was no anarchistic activity except some ascetic religious movements in the Muslim world or in Christian Europe. This kind of tradition later gave birth to religious anarchism. In the Sasanian Empire, Mazdak called for an egalitarian society and the abolition of monarchy, only to be soon executed by Emperor Kavad I. + +In Basra, religious sects preached against the state. In Europe, various sects developed anti-state and libertarian tendencies. Libertarian ideas further emerged during the Renaissance with the spread of reasoning and humanism through Europe. Novelists fictionalised ideal societies that were based not on coercion but voluntarism. The Enlightenment further pushed towards anarchism with the optimism for social progress. + +During the French Revolution, partisan groups such as the Enragés and the sans-culottes saw a turning point in the fermentation of anti-state and federalist sentiments. The first anarchist currents developed throughout the 18th century—William Godwin espoused philosophical anarchism in England, morally delegitimizing the state, Max Stirner's thinking paved the way to individualism, and Pierre-Joseph Proudhon's theory of mutualism found fertile soil in France. This era of classical anarchism lasted until the end of the Spanish Civil War of 1936 and is considered the golden age of anarchism.
+Drawing from mutualism, Mikhail Bakunin founded collectivist anarchism and entered the International Workingmen's Association, a working-class union later known as the First International, which formed in 1864 to unite diverse revolutionary currents. The International became a significant political force, with Karl Marx being a leading figure and a member of its General Council. Bakunin's faction (the Jura Federation) and Proudhon's followers (the mutualists) opposed Marxist state socialism, advocating political abstentionism and small property holdings. After bitter disputes, the Bakuninists were expelled from the International by the Marxists at the 1872 Hague Congress. Bakunin famously predicted that if revolutionaries gained power by Marx's terms, they would end up the new tyrants of workers. After being expelled, anarchists formed the St. Imier International. Under the influence of Peter Kropotkin, a Russian philosopher and scientist, anarcho-communism overlapped with collectivism. Anarcho-communists, who drew inspiration from the 1871 Paris Commune, advocated for free federation and for the distribution of goods according to one's needs. + +At the turn of the century, anarchism had spread all over the world. In China, small groups of students imported the humanistic pro-science version of anarcho-communism. Tokyo was a hotspot for rebellious youth from countries of the Far East, travelling to the Japanese capital to study. In Latin America, Argentina was a stronghold for anarcho-syndicalism, where it became the most prominent left-wing ideology. During this time, a minority of anarchists adopted tactics of revolutionary political violence. This strategy became known as propaganda of the deed. The dismemberment of the French socialist movement into many groups, and the execution and exile of many Communards to penal colonies following the suppression of the Paris Commune, favoured individualist political expression and acts. Even though many anarchists distanced themselves from these terrorist acts, infamy came upon the movement. Illegalism was another strategy which some anarchists adopted during this period. +Anarchists enthusiastically participated in the Russian Revolution—despite concerns—in opposition to the Whites. However, they met harsh suppression after the Bolshevik government was stabilized. Several anarchists from Petrograd and Moscow fled to Ukraine, notably leading to the Kronstadt rebellion and Nestor Makhno's struggle in the Free Territory. With the anarchists being crushed in Russia, two new antithetical currents emerged, namely platformism and synthesis anarchism. The former sought to create a coherent group that would push for revolution while the latter were against anything that would resemble a political party. Seeing the victories of the Bolsheviks in the October Revolution and the resulting Russian Civil War, many workers and activists turned to communist parties, which grew at the expense of anarchism and other socialist movements. In France and the United States, members of major syndicalist movements, the General Confederation of Labour and the Industrial Workers of the World, left their organisations and joined the Communist International. + +In the Spanish Civil War, anarchists and syndicalists (CNT and FAI) once again allied themselves with various currents of leftists. A long tradition of Spanish anarchism led to anarchists playing a pivotal role in the war.
In response to the army rebellion, an anarchist-inspired movement of peasants and workers, supported by armed militias, took control of Barcelona and of large areas of rural Spain, where they collectivised the land. The Soviet Union provided some limited assistance at the beginning of the war, but the result was a bitter fight between communists and anarchists during a series of events known as the May Days, as Joseph Stalin tried to seize control of the Republicans. + +At the end of World War II, the anarchist movement was severely weakened. However, the 1960s witnessed a revival of anarchism, likely caused by a perceived failure of Marxism–Leninism and tensions built by the Cold War. During this time, anarchism took root in other movements critical towards both the state and capitalism, such as the anti-nuclear, environmental and pacifist movements, the New Left, and the counterculture of the 1960s. Anarchism became associated with punk subculture, as exemplified by bands such as Crass and the Sex Pistols, and the established feminist tendencies of anarcha-feminism returned with vigour during the second wave of feminism. + +Around the turn of the 21st century, anarchism grew in popularity and influence within anti-war, anti-capitalist, and anti-globalisation movements. Anarchists became known for their involvement in protests against the World Trade Organization, the Group of Eight and the World Economic Forum. During the protests, "ad hoc" leaderless anonymous cadres known as black blocs engaged in rioting, property destruction, and violent confrontations with the police. Other organisational tactics pioneered in this time include security culture, affinity groups, and the use of decentralised technologies such as the internet. A significant event of this period was the confrontation at the WTO conference in Seattle in 1999. Anarchist ideas have been influential in the development of the Zapatistas in Mexico and the Democratic Federation of Northern Syria, more commonly known as Rojava, a "de facto" autonomous region in northern Syria. + +Anarchist schools of thought have been generally grouped into two main historical traditions, social anarchism and individualist anarchism, owing to their different origins, values and evolution. The individualist current emphasises negative liberty in opposing restraints upon the free individual, while the social current emphasises positive liberty in aiming to achieve the free potential of society through equality and social ownership. In a chronological sense, anarchism can be segmented by the classical currents of the late 19th century, and the post-classical currents (such as anarcha-feminism, green anarchism and post-anarchism) developed thereafter. + +Beyond the specific factions of anarchist movements which constitute political anarchism lies philosophical anarchism, which holds that the state lacks moral legitimacy, without necessarily accepting the imperative of revolution to eliminate it. A component especially of individualist anarchism, philosophical anarchism may tolerate the existence of a minimal state, but argues that citizens have no moral obligation to obey government when it conflicts with individual autonomy. Anarchism pays significant attention to moral arguments since ethics have a central role in anarchist philosophy.
+ +One reaction against sectarianism within the anarchist milieu was anarchism without adjectives, a call for toleration and unity among anarchists first adopted by Fernando Tarrida del Mármol in 1889 in response to the bitter debates of anarchist theory at the time. Despite separation, the various anarchist schools of thought are not seen as distinct entities, but as tendencies that intermingle. + +Anarchism is usually placed on the far left of the political spectrum. Much of its economics and legal philosophy reflects anti-authoritarian, anti-statist, and libertarian interpretations of the radical left-wing and socialist politics of collectivism, communism, individualism, mutualism, and syndicalism, among other libertarian socialist economic theories. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist, and varieties of anarchy diverge widely. + +The inceptive currents of classical anarchism were mutualism and individualism. They were followed by the major currents of social anarchism (collectivist, communist, and syndicalist). They differ on organizational and economic aspects of their ideal society. + +Mutualism is an 18th-century economic theory that was developed into anarchist theory by Pierre-Joseph Proudhon. Its aims include reciprocity, free association, voluntary contract, federation, and credit and currency reform that would be regulated by a bank of the people. Mutualism has been retrospectively characterised as ideologically situated between individualist and collectivist forms of anarchism. Proudhon first characterised his goal as a "third form of society, the synthesis of communism and property". + +Collectivist anarchism, also known as anarchist collectivism or anarcho-collectivism, is a revolutionary socialist form of anarchism commonly associated with Mikhail Bakunin. Collectivist anarchists advocate collective ownership of the means of production, theorised to be achieved through violent revolution, and that workers be paid according to time worked, rather than goods being distributed according to need as in communism. Collectivist anarchism arose alongside Marxism, but rejected the dictatorship of the proletariat despite the stated Marxist goal of a collectivist stateless society. Anarcho-communism, also known as anarchist-communism, communist anarchism, and libertarian communism, is a theory of anarchism that advocates a communist society with common ownership of the means of production, direct democracy, and a horizontal network of voluntary associations and workers' councils with production and consumption based on the guiding principle: "From each according to his ability, to each according to his need". Anarcho-communism developed from radical socialist currents after the French Revolution, but it was first formulated as such in the Italian section of the First International. It was later expanded upon in the theoretical work of Peter Kropotkin. + +Anarcho-syndicalism, also referred to as revolutionary syndicalism, is a branch of anarchism that views labour syndicates as a potential force for revolutionary social change, replacing capitalism and the state with a new society democratically self-managed by workers. The basic principles of anarcho-syndicalism are workers' solidarity, direct action, and workers' self-management.
+ +Individualist anarchism refers to several traditions of thought within the anarchist movement that emphasise the individual and their will over any kinds of external determinants. Early influences on individualist forms of anarchism include William Godwin, Max Stirner and Henry David Thoreau. In many countries, individualist anarchism attracted a small yet diverse following of Bohemian artists and intellectuals as well as young anarchist outlaws in what became known as illegalism and individual reclamation. + +Anarchist principles undergird contemporary radical social movements of the left. Interest in the anarchist movement developed alongside momentum in the anti-globalization movement, whose leading activist networks were anarchist in orientation. As the movement shaped 21st century radicalism, wider embrace of anarchist principles signaled a revival of interest. Contemporary news coverage which emphasizes black bloc demonstrations has reinforced anarchism's historical association with chaos and violence, although its publicity has also led more scholars to engage with the anarchist movement. Anarchism has continued to generate many philosophies and movements—at times eclectic, drawing upon various sources, and syncretic, combining disparate concepts to create new philosophical approaches. The anti-capitalist tradition of classical anarchism has remained prominent within contemporary currents. + +Various anarchist groups, tendencies, and schools of thought exist today, making it difficult to describe the contemporary anarchist movement. While theorists and activists have established "relatively stable constellations of anarchist principles", there is no consensus on which principles are core. As a result, commentators describe multiple "anarchisms" (rather than a singular "anarchism") in which common principles are shared between schools of anarchism while each group prioritizes those principles differently. For example, gender equality can be a common principle but ranks as a higher priority to anarcha-feminists than to anarchist communists. Anarchists are generally committed against coercive authority in all forms, namely "all centralized and hierarchical forms of government (e.g., monarchy, representative democracy, state socialism, etc.), economic class systems (e.g., capitalism, Bolshevism, feudalism, slavery, etc.), autocratic religions (e.g., fundamentalist Islam, Roman Catholicism, etc.), patriarchy, heterosexism, white supremacy, and imperialism". However, anarchist schools disagree on the methods by which these forms should be opposed. + +Anarchists' tactics take various forms but in general serve two major goals—first, to oppose the Establishment; and second, to promote anarchist ethics and reflect an anarchist vision of society, illustrating the unity of means and ends. A broad categorization can be made between aims to destroy oppressive states and institutions by revolutionary means, and aims to change society through evolutionary means. Evolutionary tactics reject violence and take a gradual approach to anarchist aims, though there is significant overlap between the two. + +Anarchist tactics have shifted during the course of the last century. Anarchists during the early 20th century focused more on strikes and militancy, while contemporary anarchists use a broader array of approaches. + +During the classical era, anarchists had a militant tendency.
Not only did they confront state armed forces (as in Spain and Ukraine) but some of them also employed terrorism as propaganda of the deed. Assassination attempts were carried out against heads of state, some of which were successful. Anarchists also took part in revolutions. Anarchist perspectives towards violence have always been perplexing and controversial. On one hand, anarcho-pacifists point out the unity of means and ends. On the other hand, other anarchist groups advocate direct action, a tactic which can include acts of sabotage or even acts of terrorism. This attitude was quite prominent a century ago; seeing the state as a tyrant, some anarchists believed that they had every right to oppose its oppression by any means possible. Emma Goldman and Errico Malatesta, who were proponents of limited use of violence, argued that violence is merely a reaction to state violence as a necessary evil. + +Anarchists took an active role in strikes, although they tended to be antipathetic to formal syndicalism, seeing it as reformist. They saw it as a part of the movement which sought to overthrow the state and capitalism. Anarchists also spread their propaganda through the arts, and some of them practiced nudism. They built communities based on friendship and were also involved in the press. + +In the current era, Italian anarchist Alfredo Bonanno, a proponent of insurrectionary anarchism, has revived the debate on violence by rejecting the nonviolence tactic adopted since the late 19th century by Kropotkin and other prominent anarchists afterwards. Both Bonanno and the French group The Invisible Committee advocate for small, informal affiliation groups, where each member is responsible for their own actions but works together to bring down oppression utilizing sabotage and other violent means against the state, capitalism, and other enemies. Members of The Invisible Committee were arrested in 2008 on various charges, terrorism included. + +Overall, today's anarchists are much less violent and militant than their ideological ancestors. They mostly engage in confronting the police during demonstrations and riots, especially in countries like Canada, Mexico or Greece. Militant black bloc protest groups are known for clashing with the police. However, anarchists not only clash with state operators; they also engage in the struggle against fascists and racists, taking anti-fascist action and mobilizing to prevent hate rallies from happening. + +Anarchists commonly employ direct action. This can take the form of disrupting and protesting against unjust hierarchy, or the form of self-managing their lives through the creation of counter-institutions such as communes and non-hierarchical collectives. Often, decision-making is handled in an anti-authoritarian way, with everyone having equal say in each decision, an approach known as horizontalism. Contemporary-era anarchists have been engaging with various grassroots movements that are not explicitly anarchist but are more or less based on horizontalism, respecting personal autonomy, and participating in mass activism such as strikes and demonstrations. The newly coined term "small-a anarchism", in contrast with the "big-A anarchism" of the classical era, signals their tendency not to base their thoughts and actions on classical-era anarchism or to refer to Kropotkin or Proudhon to justify their opinions. They would rather base their thought and praxis on their own experience, which they will later theorize.
+ +The decision-making process of small anarchist affinity groups plays a significant tactical role. Anarchists have employed various methods to build a rough consensus among members of their group, without the need for a leader or a leading group. One way is for an individual from the group to play the role of facilitator to help achieve a consensus without taking part in the discussion themselves or promoting a specific point. Minorities usually accept rough consensus, except when they feel the proposal contradicts anarchist goals, values, or ethics. Anarchists usually form small groups (5–20 individuals) to enhance autonomy and friendships among their members. These kinds of groups more often than not interconnect with each other, forming larger networks. Anarchists still support and participate in strikes, especially wildcat strikes; these are leaderless strikes not organised centrally by a syndicate. + +Anarchists have gone online to spread their message. As in the past, newspapers and journals are used; however, because of distributional and other difficulties, anarchists have found it easier to create websites, hosting electronic libraries and other portals. Anarchists have also been involved in developing various pieces of software that are available for free. The way these hacktivists work to develop and distribute software resembles anarchist ideals, especially when it comes to preserving users' privacy from state surveillance. + +Anarchists organize themselves to squat and reclaim public spaces. During important events such as protests, and when spaces are being occupied, these are often called Temporary Autonomous Zones (TAZ), spaces where surrealism, poetry and art are blended to display the anarchist ideal. As seen by anarchists, squatting is a way to regain urban space from the capitalist market, serving pragmatic needs, and is also seen as an exemplary form of direct action. Acquiring space enables anarchists to experiment with their ideas and build social bonds. These tactics, together with various forms of protest at highly symbolic events, make up a carnivalesque atmosphere that is part of contemporary anarchist vibrancy, bearing in mind that not all anarchists share the same attitudes towards them. + +As anarchism is a philosophy that embodies many diverse attitudes, tendencies, and schools of thought, and disagreement over questions of values, ideology, and tactics is common, its diversity has led to widely different uses of identical terms among different anarchist traditions, which has created a number of definitional concerns in anarchist theory. For instance, the compatibility of capitalism, nationalism and religion with anarchism is widely disputed. Similarly, anarchism enjoys complex relationships with ideologies such as Marxism, communism, collectivism and trade unionism. Anarchists may be motivated by humanism, divine authority, enlightened self-interest, veganism, or any number of alternative ethical doctrines. Phenomena such as civilisation, technology (e.g. within anarcho-primitivism) and the democratic process may be sharply criticised within some anarchist tendencies and simultaneously lauded in others. + +Gender and sexuality carry with them dynamics of hierarchy; anarchism is obliged to address, analyse and oppose the suppression of one's autonomy because of the dynamics that gender roles traditionally impose.
+ +A historical current that arose and flourished between 1890 and 1920 within anarchism was free love; in contemporary anarchism, this current survives as a tendency to support polyamory and queer anarchism. Free love advocates were against marriage, which they saw as a way of men imposing authority over women, largely because marriage law greatly favoured the power of men. The notion of free love, though, was much broader; it included critique of the established order that limited women's sexual freedom and pleasure. Such free love movements contributed to the establishment of communal houses, where large groups of travelers, anarchists, and other activists slept in beds together. Free love had roots both in Europe and the United States. Some anarchists, however, struggled with the jealousy that arose from free love. Anarchist feminists advocated free love, opposed marriage, and were pro-choice (to use a contemporary term), pursuing a broadly similar agenda. Anarchist and non-anarchist feminists differed on suffrage, but were nonetheless supportive of one another. + +During the second half of the 20th century, anarchism intermingled with the second wave of feminism, radicalizing some currents of the feminist movement (and being influenced as well). By the latest decades of the 20th century, anarchists and feminists were advocating for the rights and autonomy of women, gays, queers and other marginalized groups, with some feminist thinkers suggesting a fusion of the two currents. With the third wave of feminism, sexual identity and compulsory heterosexuality became a subject of study for anarchists, which yielded a post-structuralist critique of sexual normality. However, some anarchists distanced themselves from this line of thinking, suggesting that it leaned towards individualism and was, therefore, dropping the cause of social liberation. + +The interest of anarchists in education stretches back to the first emergence of classical anarchism. Anarchists consider 'proper' education, which sets the foundations of the future autonomy of the individual and the society, to be an act of mutual aid. Anarchist writers such as William Godwin and Max Stirner attacked both state education and private education as another means by which the ruling class replicates its privileges. + +In 1901, Catalan anarchist and free thinker Francisco Ferrer established the Escuela Moderna in Barcelona in opposition to the established education system, which was dictated largely by the Catholic Church. Ferrer's approach was secular, rejecting both state and church involvement in the educational process, and gave pupils large amounts of autonomy in planning their work and attendance. Ferrer aimed to educate the working class and explicitly sought to foster class consciousness among students. The school closed after constant harassment by the state, and Ferrer was later arrested. His ideas, however, formed the inspiration for a series of modern schools around the world. Christian anarchist Leo Tolstoy also established a similar school, with its founding principle, according to Tolstoy, being that "for education to be effective it had to be free". In a similar vein, A. S. Neill founded what became Summerhill School in 1921, likewise declaring it to be free from coercion. + +Anarchist education is based largely on the idea that a child's right to develop freely, without manipulation, ought to be respected, and that rationality will lead children to morally good conclusions.
However, there has been little consensus among anarchist figures as to what constitutes manipulation; Ferrer, for example, believed that moral indoctrination was necessary and explicitly taught pupils that equality, liberty, and social justice were not possible under capitalism (along with other critiques of nationalism and government). + +Late 20th century and contemporary anarchist writers (such as Colin Ward, Herbert Read and Paul Goodman) intensified and expanded the anarchist critique of state education, largely focusing on the need for a system that focuses on children's creativity rather than on their ability to attain a career or participate in consumer society. Contemporary anarchists, such as Colin Ward, have further argued that state education serves to perpetuate socio-economic inequality. + +While few anarchist education institutions have survived to the modern day, major tenets of anarchist schools, such as respect for child autonomy and relying on reasoning rather than indoctrination as a teaching method, have spread among mainstream educational institutions. + +Objection to the state and its institutions is a "sine qua non" of anarchism. Anarchists consider the state as a tool of domination and believe it to be illegitimate regardless of its political tendencies. Instead of people being able to control the aspects of their life, major decisions are taken by a small elite. Authority ultimately rests solely on power, regardless of whether that power is open or transparent, as it still has the ability to coerce people. Another anarchist argument against states is that the people constituting a government, even the most altruistic among officials, will unavoidably seek to gain more power, leading to corruption. Anarchists consider the idea that the state is the collective will of the people to be an unachievable fiction, due to the fact that the ruling class is distinct from the rest of society. + +The connection between anarchism and art was quite profound during the classical era of anarchism, especially among artistic currents that were developing during that era, such as futurists, surrealists, and others, while in literature anarchism was mostly associated with the New Apocalyptics and the Neo-romanticism movement. In music, anarchism has been associated with music scenes such as punk. Anarchists such as Leo Tolstoy and Herbert Read argued that the border between the artist and the non-artist, what separates art from a daily act, is a construct produced by the alienation caused by capitalism, and it prevents humans from living a joyful life. + +Other anarchists advocated for or used art as a means to achieve anarchist ends. In his book Breaking the Spell: A History of Anarchist Filmmakers, Videotape Guerrillas, and Digital Ninjas, Chris Robé claims that "anarchist-inflected practices have increasingly structured movement-based video activism." + +Three overlapping properties made art useful to anarchists: it could depict a critique of existing society and hierarchies; it could serve as a prefigurative tool to reflect the anarchist ideal society; and it could turn into a means of direct action, in protests for example. As it appeals to both emotion and reason, art could appeal to the "whole human" and have a powerful effect. + +Philosophy lecturer Andrew G. Fiala has listed five main arguments against anarchism. Firstly, he notes that anarchism is related to violence and destruction, not only in the pragmatic world (i.e. at protests) but in the world of ethics as well.
The second argument is that it is impossible for a society to function without a state or something like a state, acting to protect citizens from criminality. Fiala takes "Leviathan" from Thomas Hobbes and the night-watchman state from philosopher Robert Nozick as examples. Thirdly, anarchism is evaluated as unfeasible or utopian since the state cannot be defeated practically; this line of argument most often calls for political action within the system to reform it. The fourth argument is that anarchism is self-contradictory: while it advocates that no one should rule, if anarchism were accepted by the many, it would turn into the ruling political theory. Within this line of criticism also comes the alleged self-contradiction that anarchism calls for collective action while endorsing the autonomy of the individual, hence no collective action can be taken. Lastly, Fiala mentions a critique of philosophical anarchism as being ineffective (all talk and thought) while capitalism and the bourgeois class remain strong. + +Philosophical anarchism has met with criticism from members of academia following the release of pro-anarchist books such as A. John Simmons' "Moral Principles and Political Obligations" (1979). Law professor William A. Edmundson authored an essay arguing against three major philosophical anarchist principles, which he finds fallacious; Edmundson claims that while the individual does not owe a normal state a duty of obedience, this does not imply that anarchism is the inevitable conclusion, and the state is still morally legitimate. + + + + + + + + + +Autism + +Autism is a developmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents often notice signs during the first three years of their child's life. These signs often develop gradually, though some children with autism experience worsening in their communication and social skills after reaching developmental milestones at a normal pace. +Autism is associated with a combination of genetic and environmental factors. Risk factors during pregnancy include certain infections, such as rubella, toxins including valproic acid, alcohol, cocaine, pesticides, lead, and air pollution, fetal growth restriction, and autoimmune diseases. Controversies surround other proposed environmental causes; for example, the vaccine hypothesis, which has been disproven. Autism affects information processing in the brain and how nerve cells and their synapses connect and organize; how this occurs is not well understood. The Diagnostic and Statistical Manual of Mental Disorders (DSM-5) combines autism and less severe forms of the condition, including Asperger syndrome and pervasive developmental disorder not otherwise specified (PDD-NOS), into the diagnosis of autism spectrum disorder (ASD). +Early behavioral interventions or speech therapy can help children with autism gain self-care, social, and communication skills. Although there is no known cure, there have been cases of children who recovered. Some autistic adults are unable to live independently. An autistic culture has developed, with some individuals seeking a cure and others believing autism should be accepted as a difference to be accommodated instead of cured. +Globally, autism is estimated to affect 24.8 million people. In the 2000s, the number of people affected was estimated at 1–2 per 1,000 people worldwide.
In developed countries, about 1.5% of children are diagnosed with ASD, up from 0.7% in 2000 in the United States. It occurs four-to-five times more often in males than females. The number of people diagnosed has increased dramatically since the 1960s, which may be partly due to changes in diagnostic practice. The question of whether actual rates have increased is unresolved. +Autism is a highly variable neurodevelopmental disorder whose symptoms first appear during infancy or childhood and which generally follows a steady course without remission. People with autism may be severely impaired in some respects but average, or even superior, in others. Overt symptoms gradually begin after the age of six months, become established by age two or three years and tend to continue through adulthood, although often in more muted form. It is distinguished by a characteristic triad of symptoms: impairments in social interaction, impairments in communication, and repetitive behavior. Other aspects, such as atypical eating, are also common but are not essential for diagnosis. Individual symptoms of autism occur in the general population and appear not to be strongly associated with one another, without a sharp line separating pathologically severe from common traits. + +Social deficits distinguish autism and the related autism spectrum disorders (ASD; see Classification) from other developmental disorders. People with autism have social impairments and often lack the intuition about others that many people take for granted. Noted autistic Temple Grandin described her inability to understand the social communication of neurotypicals, or people with typical neural development, as leaving her feeling "like an anthropologist on Mars". + +Unusual social development becomes apparent early in childhood. Autistic infants show less attention to social stimuli, smile and look at others less often, and respond less to their own name. Autistic toddlers differ more strikingly from social norms; for example, they have less eye contact and turn-taking, and do not have the ability to use simple movements to express themselves, such as pointing at things. Three- to five-year-old children with autism are less likely to exhibit social understanding, approach others spontaneously, imitate and respond to emotions, communicate nonverbally, and take turns with others. However, they do form attachments to their primary caregivers. Most children with autism display moderately less attachment security than neurotypical children, although this difference disappears in children with higher mental development or less pronounced autistic traits. Older children and adults with ASD perform worse on tests of face and emotion recognition, although this may be partly due to a lower ability to define one's own emotions. + +Children with high-functioning autism have more intense and frequent loneliness compared to non-autistic peers, despite the common belief that children with autism prefer to be alone. Making and maintaining friendships often proves to be difficult for those with autism. For them, the quality of friendships, not the number of friends, predicts how lonely they feel. Functional friendships, such as those resulting in invitations to parties, may affect the quality of life more deeply. +There are many anecdotal reports, but few systematic studies, of aggression and violence in individuals with ASD.
The limited data suggest that, in children with intellectual disability, autism is associated with aggression, destruction of property, and meltdowns. + +About a third to a half of individuals with autism do not develop enough natural speech to meet their daily communication needs. Differences in communication may be present from the first year of life, and may include delayed onset of babbling, unusual gestures, diminished responsiveness, and vocal patterns that are not synchronized with the caregiver. In the second and third years, children with autism have less frequent and less diverse babbling, consonants, words, and word combinations; their gestures are less often integrated with words. Children with autism are less likely to make requests or share experiences, and are more likely to simply repeat others' words (echolalia) or reverse pronouns. Joint attention seems to be necessary for functional speech, and deficits in joint attention seem to distinguish infants with ASD. For example, they may look at a pointing hand instead of the pointed-at object, and they consistently fail to point at objects in order to comment on or share an experience. Children with autism may have difficulty with imaginative play and with developing symbols into language. + +In a pair of studies, high-functioning children with autism aged 8–15 performed as well as individually matched controls at basic language tasks involving vocabulary and spelling, and as adults performed better than controls. Both autistic groups performed worse than controls at complex language tasks such as figurative language, comprehension and inference. As people are often sized up initially from their basic language skills, these studies suggest that people speaking to autistic individuals are more likely to overestimate what their audience comprehends. + +Autistic individuals can display many forms of repetitive or restricted behavior, which the Repetitive Behavior Scale-Revised (RBS-R) categorizes into stereotyped behavior, self-injurious behavior, compulsive behavior, ritualistic behavior, sameness behavior, and restricted behavior. No single repetitive or self-injurious behavior seems to be specific to autism, but autism appears to have an elevated pattern of occurrence and severity of these behaviors. + +Autistic individuals may have symptoms that are independent of the diagnosis, but that can affect the individual or the family. +An estimated 0.5% to 10% of individuals with ASD show unusual abilities, ranging from splinter skills such as the memorization of trivia to the extraordinarily rare talents of prodigious autistic savants. Many individuals with ASD show superior skills in perception and attention, relative to the general population. Sensory abnormalities are found in over 90% of those with autism, and are considered core features by some, although there is no good evidence that sensory symptoms differentiate autism from other developmental disorders. Differences are greater for under-responsivity (for example, walking into things) than for over-responsivity (for example, distress from loud noises) or for sensation seeking (for example, rhythmic movements). An estimated 60–80% of autistic people have motor signs that include poor muscle tone, poor motor planning, and toe walking; deficits in motor coordination are pervasive across ASD and are greater in autism proper. Unusual eating behavior occurs in about three-quarters of children with ASD, to the extent that it was formerly a diagnostic indicator. Selectivity is the most common problem, although eating rituals and food refusal also occur.
+ +There is tentative evidence that autism occurs more frequently in people with gender dysphoria. + +Gastrointestinal problems are one of the most commonly associated medical disorders in people with autism. These are linked to greater social impairment, irritability, behavior and sleep problems, language impairments and mood changes. + +Parents of children with ASD have higher levels of stress. Siblings of children with ASD report greater admiration of and less conflict with the affected sibling than siblings of unaffected children, and are similar to siblings of children with Down syndrome in these aspects of the sibling relationship. However, they reported lower levels of closeness and intimacy than siblings of children with Down syndrome; siblings of individuals with ASD have greater risk of negative well-being and poorer sibling relationships as adults. + +It has long been presumed that there is a common cause at the genetic, cognitive, and neural levels for autism's characteristic triad of symptoms. However, there is increasing suspicion that autism is instead a complex disorder whose core aspects have distinct causes that often co-occur. +Autism has a strong genetic basis, although the genetics of autism are complex and it is unclear whether ASD is explained more by rare mutations with major effects, or by rare multigene interactions of common genetic variants. Complexity arises due to interactions among multiple genes, the environment, and epigenetic factors which do not change the DNA sequence but are heritable and influence gene expression. Many genes have been associated with autism through sequencing the genomes of affected individuals and their parents. Studies of twins suggest that heritability is 0.7 for autism and as high as 0.9 for ASD (the sketch below illustrates how twin correlations are converted into such estimates), and siblings of those with autism are about 25 times more likely to be autistic than the general population. However, most of the mutations that increase autism risk have not been identified. Typically, autism cannot be traced to a Mendelian (single-gene) mutation or to a single chromosome abnormality, and none of the genetic syndromes associated with ASDs have been shown to selectively cause ASD. Numerous candidate genes have been located, with only small effects attributable to any particular gene. Most loci individually explain less than 1% of cases of autism. The large number of autistic individuals with unaffected family members may result from spontaneous structural variation—such as deletions, duplications or inversions in genetic material during meiosis. Hence, a substantial fraction of autism cases may be traceable to genetic causes that are highly heritable but not inherited: that is, the mutation that causes the autism is not present in the parental genome. Autism may be underdiagnosed in women and girls due to an assumption that it is primarily a male condition, but genetic phenomena such as imprinting and X linkage have the ability to raise the frequency and severity of conditions in males, and theories have been put forward for a genetic reason why males are diagnosed more often, such as the imprinted brain theory and the extreme male brain theory. + +Maternal nutrition and inflammation during preconception and pregnancy influence fetal neurodevelopment. Intrauterine growth restriction is associated with ASD, in both term and preterm infants. Maternal inflammatory and autoimmune diseases may damage fetal tissues, aggravating a genetic problem or damaging the nervous system.
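As a rough illustration of how twin studies yield heritability figures like those above, Falconer's classic formula estimates heritability as twice the gap between monozygotic and dizygotic twin correlations, h^2 = 2(r_MZ - r_DZ). The sketch below uses hypothetical placeholder correlations; it is not the method or the data behind the estimates cited in the text.

```python
# Minimal sketch of Falconer's formula for twin-study heritability.
# The correlations below are hypothetical placeholders, not data from
# the studies cited above.

def falconer_heritability(r_mz: float, r_dz: float) -> float:
    """h^2 = 2 * (r_MZ - r_DZ): the MZ/DZ gap is doubled because DZ
    twins share on average half of the segregating genes that MZ twins
    share fully."""
    return 2.0 * (r_mz - r_dz)

# Hypothetical twin correlations chosen so the estimate lands near the
# ~0.9 ASD heritability mentioned in the text.
print(falconer_heritability(r_mz=0.95, r_dz=0.50))  # -> 0.9
```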
+ +Exposure to air pollution during pregnancy, especially heavy metals and particulates, may increase the risk of autism. Environmental factors that have been claimed without evidence to contribute to or exacerbate autism include certain foods, infectious diseases, solvents, PCBs, phthalates and phenols used in plastic products, pesticides, brominated flame retardants, alcohol, smoking, illicit drugs, vaccines, and prenatal stress. Some, such as the MMR vaccine, have been completely disproven. + +Parents may first become aware of autistic symptoms in their child around the time of a routine vaccination. This has led to unsupported theories blaming vaccine "overload", a vaccine preservative, or the MMR vaccine for causing autism. The latter theory was supported by a litigation-funded study that has since been shown to have been "an elaborate fraud". Although these theories lack convincing scientific evidence and are biologically implausible, parental concern about a potential vaccine link with autism has led to lower rates of childhood immunizations, outbreaks of previously controlled childhood diseases in some countries, and the preventable deaths of several children. + +Autism's symptoms result from maturation-related changes in various systems of the brain. How autism occurs is not well understood. Its mechanism can be divided into two areas: the pathophysiology of brain structures and processes associated with autism, and the neuropsychological linkages between brain structures and behaviors. The behaviors appear to have multiple pathophysiologies. + +There is evidence that gut–brain axis abnormalities may be involved. A 2015 review proposed that immune dysregulation, gastrointestinal inflammation, malfunction of the autonomic nervous system, gut flora alterations, and food metabolites may cause brain neuroinflammation and dysfunction. A 2016 review concluded that enteric nervous system abnormalities might play a role in neurological disorders such as autism. Neural connections and the immune system are a pathway that may allow diseases originating in the intestine to spread to the brain. + +Several lines of evidence point to synaptic dysfunction as a cause of autism. Some rare mutations may lead to autism by disrupting some synaptic pathways, such as those involved with cell adhesion. Gene replacement studies in mice suggest that autistic symptoms are closely related to later developmental steps that depend on activity in synapses and on activity-dependent changes. All known teratogens (agents that cause birth defects) related to the risk of autism appear to act during the first eight weeks from conception, and though this does not exclude the possibility that autism can be initiated or affected later, there is strong evidence that autism arises very early in development. + +Diagnosis is based on behavior, not cause or mechanism. Under the DSM-5, autism is characterized by persistent deficits in social communication and interaction across multiple contexts, as well as restricted, repetitive patterns of behavior, interests, or activities. These deficits are present in early childhood, typically before age three, and lead to clinically significant functional impairment. Sample symptoms include lack of social or emotional reciprocity, stereotyped and repetitive use of language or idiosyncratic language, and persistent preoccupation with unusual objects. The disturbance must not be better accounted for by Rett syndrome, intellectual disability or global developmental delay.
ICD-10 uses essentially the same definition. + +Several diagnostic instruments are available. Two are commonly used in autism research: the Autism Diagnostic Interview-Revised (ADI-R) is a semistructured parent interview, and the Autism Diagnostic Observation Schedule (ADOS) uses observation and interaction with the child. The Childhood Autism Rating Scale (CARS) is used widely in clinical environments to assess severity of autism based on observation of children. The Diagnostic Interview for Social and Communication Disorders (DISCO) may also be used. + +A pediatrician commonly performs a preliminary investigation by taking developmental history and physically examining the child. If warranted, diagnosis and evaluations are conducted with help from ASD specialists, observing and assessing cognitive, communication, family, and other factors using standardized tools, and taking into account any associated medical conditions. A pediatric neuropsychologist is often asked to assess behavior and cognitive skills, both to aid diagnosis and to help recommend educational interventions. A differential diagnosis for ASD at this stage might also consider intellectual disability, hearing impairment, and a specific language impairment such as Landau–Kleffner syndrome. The presence of autism can make it harder to diagnose coexisting psychiatric disorders such as depression. + +Clinical genetics evaluations are often done once ASD is diagnosed, particularly when other symptoms already suggest a genetic cause. Although genetic technology allows clinical geneticists to link an estimated 40% of cases to genetic causes, consensus guidelines in the US and UK are limited to high-resolution chromosome and fragile X testing. A genotype-first model of diagnosis has been proposed, which would routinely assess the genome's copy number variations. As new genetic tests are developed, several ethical, legal, and social issues will emerge. Commercial availability of tests may precede adequate understanding of how to use test results, given the complexity of autism's genetics. Metabolic and neuroimaging tests are sometimes helpful, but are not routine. + +ASD can sometimes be diagnosed by age 14 months, although diagnosis becomes increasingly stable over the first three years of life: for example, a one-year-old who meets diagnostic criteria for ASD is less likely than a three-year-old to continue to do so a few years later. In the UK the National Autism Plan for Children recommends at most 30 weeks from first concern to completed diagnosis and assessment, though few cases are handled that quickly in practice. Although the symptoms of autism and ASD begin early in childhood, they are sometimes missed; years later, adults may seek diagnoses to help them or their friends and family understand themselves, to help their employers make adjustments, or in some locations to claim disability living allowances or other benefits. Girls are often diagnosed later than boys. + +Underdiagnosis and overdiagnosis are problems in marginal cases, and much of the recent increase in the number of reported ASD cases is likely due to changes in diagnostic practices. The increasing popularity of drug treatment options and the expansion of benefits have given providers incentives to diagnose ASD, resulting in some overdiagnosis of children with uncertain symptoms. Conversely, the cost of screening and diagnosis and the challenge of obtaining payment can inhibit or delay diagnosis.
It is particularly hard to diagnose autism among the visually impaired, partly because some of its diagnostic criteria depend on vision, and partly because autistic symptoms overlap with those of common blindness syndromes or blindisms. + +Autism is one of the five pervasive developmental disorders (PDD), which are characterized by widespread abnormalities of social interactions and communication, and severely restricted interests and highly repetitive behavior. These symptoms do not imply sickness, fragility, or emotional disturbance. + +Of the five PDD forms, Asperger syndrome is closest to autism in signs and likely causes; Rett syndrome and childhood disintegrative disorder share several signs with autism, but may have unrelated causes; PDD not otherwise specified (PDD-NOS; also called "atypical autism") is diagnosed when the criteria are not met for a more specific disorder. Unlike with autism, people with Asperger syndrome have no substantial delay in language development. The terminology of autism can be bewildering, with autism, Asperger syndrome and PDD-NOS often called the "autism spectrum disorders" (ASD) or sometimes the "autistic disorders", whereas autism itself is often called "autistic disorder", "childhood autism", or "infantile autism". In this article, "autism" refers to the classic autistic disorder; in clinical practice, though, "autism", "ASD", and "PDD" are often used interchangeably. ASD, in turn, is a subset of the broader autism phenotype, which describes individuals who may not have ASD but do have autistic-like traits, such as avoiding eye contact. + +Autism can also be divided into syndromal and non-syndromal autism; syndromal autism is associated with severe or profound intellectual disability or a congenital syndrome with physical symptoms, such as tuberous sclerosis. Although individuals with Asperger syndrome tend to perform better cognitively than those with autism, the extent of the overlap between Asperger syndrome, high-functioning autism (HFA), and non-syndromal autism is unclear. + +Some studies have reported diagnoses of regressive autism in children, based on a loss of language or social skills, as opposed to a failure to make progress, typically from 15 to 30 months of age. The validity of this distinction remains controversial; it is possible that regressive autism is a specific subtype, or that there is a continuum of behaviors between autism with and without regression. + +Research into causes has been hampered by the inability to identify biologically meaningful subgroups within the autistic population and by the traditional boundaries between the disciplines of psychiatry, psychology, neurology and pediatrics. Newer technologies such as fMRI and diffusion tensor imaging can help identify biologically relevant phenotypes (observable traits) that can be viewed on brain scans, to help further neurogenetic studies of autism; one example is lowered activity in the fusiform face area of the brain, which is associated with impaired perception of people versus objects. It has been proposed to classify autism using genetics as well as behavior. + +Autism has long been thought to cover a wide spectrum, ranging from individuals with severe impairments—who may be silent, developmentally disabled, and prone to frequent repetitive behavior such as hand flapping and rocking—to high functioning individuals who may have active but distinctly odd social approaches, narrowly focused interests, and verbose, pedantic communication.
Because the behavior spectrum is continuous, boundaries between diagnostic categories are necessarily somewhat arbitrary. Sometimes the syndrome is divided into low-, medium- or high-functioning autism (LFA, MFA, and HFA), based on IQ thresholds. Some people have called for an end to the terms "high-functioning" and "low-functioning" due to lack of nuance and the potential for a person's needs or abilities to be overlooked. + +About half of parents of children with ASD notice their child's unusual behaviors by age 18 months, and about four-fifths notice by age 24 months. According to one article, failure to meet certain milestones (no babbling by 12 months, no gesturing by 12 months, no single words by 16 months, no two-word spontaneous phrases by 24 months, or any loss of language or social skills at any age) "is an absolute indication to proceed with further evaluations. Delay in referral for such testing may delay early diagnosis and treatment and affect the long-term outcome". + +The United States Preventive Services Task Force in 2016 found it was unclear whether screening was beneficial or harmful among children in whom there are no concerns. The Japanese practice is to screen all children for ASD at 18 and 24 months, using autism-specific formal screening tests. In contrast, in the UK, children whose families or doctors recognize possible signs of autism are screened. It is not known which approach is more effective. Screening tools include the Modified Checklist for Autism in Toddlers (M-CHAT), the Early Screening of Autistic Traits Questionnaire, and the First Year Inventory; initial data on M-CHAT and its predecessor, the Checklist for Autism in Toddlers (CHAT), on children aged 18–30 months suggest that it is best used in a clinical setting and that it has low sensitivity (many false-negatives) but good specificity (few false-positives); the short sketch below illustrates how these two rates are computed. It may be more accurate to precede these tests with a broadband screener that does not distinguish ASD from other developmental disorders. Screening tools designed for one culture's norms for behaviors like eye contact may be inappropriate for a different culture. Although genetic screening for autism is generally still impractical, it can be considered in some cases, such as children with neurological symptoms and dysmorphic features. + +While infection with rubella during pregnancy causes fewer than 1% of cases of autism, vaccination against rubella can prevent many of those cases. + +The main goals when treating children with autism are to lessen associated deficits and family distress, and to increase quality of life and functional independence. In general, higher IQs are correlated with greater responsiveness to treatment and improved treatment outcomes. No single treatment is best, and treatment is typically tailored to the child's needs. Families and the educational system are the main resources for treatment. Services should be carried out by behavior analysts, special education teachers, speech pathologists, and licensed psychologists. Studies of interventions have methodological problems that prevent definitive conclusions about efficacy. However, the development of evidence-based interventions has advanced in recent years. Although many psychosocial interventions have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectiveness of treatment options.
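To make the screening-accuracy terms above concrete, the following minimal sketch computes sensitivity and specificity from confusion-matrix counts. The counts are hypothetical placeholders, not M-CHAT data; the sketch only illustrates why "low sensitivity" means many missed true cases (false negatives) while "good specificity" means few wrongly flagged non-cases (false positives).

```python
# Minimal sketch of the screening-accuracy terms discussed above.
# All counts are hypothetical, not M-CHAT data.

def sensitivity(true_pos: int, false_neg: int) -> float:
    """Fraction of actual cases the screen detects."""
    return true_pos / (true_pos + false_neg)

def specificity(true_neg: int, false_pos: int) -> float:
    """Fraction of non-cases the screen correctly passes."""
    return true_neg / (true_neg + false_pos)

# A hypothetical screen that misses many true cases (low sensitivity)
# but rarely flags children without the condition (good specificity).
print(sensitivity(true_pos=20, false_neg=30))   # 0.40 -> low sensitivity
print(specificity(true_neg=940, false_pos=10))  # ~0.99 -> good specificity
```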
Intensive, sustained special education programs and behavior therapy early in life can help children acquire self-care, communication, and job skills, and often improve functioning and decrease symptom severity and maladaptive behaviors; claims that intervention by around age three years is crucial are not substantiated. While medications have not been found to help with core symptoms, they may be used for associated symptoms, such as irritability, inattention, or repetitive behavior patterns. + +Educational interventions often used include applied behavior analysis (ABA), developmental models, structured teaching, speech and language therapy, social skills therapy, and occupational therapy. Among these approaches, interventions either treat autistic features comprehensively or focus treatment on a specific area of deficit. The quality of research for early intensive behavioral intervention (EIBI)—a treatment procedure incorporating over thirty hours per week of the structured type of ABA that is carried out with very young children—is currently low, and more rigorous research designs with larger sample sizes are needed. Two theoretical frameworks outlined for early childhood intervention include structured and naturalistic ABA interventions, and developmental social pragmatic models (DSP). One interventional strategy utilizes a parent training model, which teaches parents how to implement various ABA and DSP techniques, allowing parents to deliver interventions themselves. Various DSP programs have been developed to explicitly deliver intervention systems through at-home parent implementation. Although parent training models were developed only recently, these interventions have demonstrated effectiveness in numerous studies and have been evaluated as a probably efficacious mode of treatment. + +Early, intensive ABA therapy has demonstrated effectiveness in enhancing communication and adaptive functioning in preschool children; it is also well-established for improving the intellectual performance of that age group. Similarly, a teacher-implemented intervention that utilizes a more naturalistic form of ABA combined with a developmental social pragmatic approach has been found to be beneficial in improving social-communication skills in young children, although there is less evidence of its effect on global symptoms. Neuropsychological reports are often poorly communicated to educators, resulting in a gap between what a report recommends and what education is provided. It is not known whether treatment programs for children lead to significant improvements after the children grow up, and the limited research on the effectiveness of adult residential programs shows mixed results. The appropriateness of including children with varying severity of autism spectrum disorders in the general education population is a subject of current debate among educators and researchers. + +Medications may be used to treat ASD symptoms that interfere with integrating a child into home or school when behavioral treatment fails. They may also be used for associated health problems, such as ADHD or anxiety. More than half of US children diagnosed with ASD are prescribed psychoactive drugs or anticonvulsants, with the most common drug classes being antidepressants, stimulants, and antipsychotics. The atypical antipsychotic drugs risperidone and aripiprazole are FDA-approved for treating associated aggressive and self-injurious behaviors.
However, their side effects must be weighed against their potential benefits, and people with autism may respond atypically. Side effects, for example, may include weight gain, tiredness, drooling, and aggression. SSRI antidepressants, such as fluoxetine and fluvoxamine, have been shown to be effective in reducing repetitive and ritualistic behaviors, while the stimulant medication methylphenidate is beneficial for some children with co-morbid inattentiveness or hyperactivity. There is scant reliable research about the effectiveness or safety of drug treatments for adolescents and adults with ASD. No known medication relieves autism's core symptoms of social and communication impairments. Experiments in mice have reversed or reduced some symptoms related to autism by replacing or modulating gene function, suggesting the possibility of targeting therapies to specific rare mutations known to cause autism. + +Although many alternative therapies and interventions are available, few are supported by scientific studies. Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance. Some alternative treatments may place the child at risk. The preference that children with autism have for unconventional foods can lead to a reduction in bone cortical thickness, which is greater in those on casein-free diets, as a consequence of the low intake of calcium and vitamin D; however, suboptimal bone development in ASD has also been associated with lack of exercise and gastrointestinal disorders. In 2005, botched chelation therapy killed a five-year-old child with autism. Chelation is not recommended for people with ASD, since the associated risks outweigh any potential benefits. Another alternative medicine practice with no evidence is CEASE therapy, a mixture of homeopathy, supplements, and 'vaccine detoxing'. + +Although popularly used as an alternative treatment for people with autism, as of 2018 there is no good evidence to recommend a gluten- and casein-free diet as a standard treatment. A 2018 review concluded that it may be a therapeutic option for specific groups of children with autism, such as those with known food intolerances or allergies, or with food intolerance markers. The authors analyzed the prospective trials conducted to date that studied the efficacy of the gluten- and casein-free diet in children with ASD (4 in total). All of them compared a gluten- and casein-free diet with a normal diet in a control group (2 double-blind randomized controlled trials, 1 double-blind crossover trial, 1 single-blind trial). In two of the studies, whose duration was 12 and 24 months, a significant improvement in ASD symptoms (efficacy rate 50%) was identified. In the other two studies, whose duration was 3 months, no significant effect was observed. The authors concluded that a longer duration of the diet may be necessary to achieve improvement of ASD symptoms. Other problems documented in the trials include transgressions of the diet, small sample sizes, heterogeneity of the participants, and the possibility of a placebo effect. + +In the subset of people who have gluten sensitivity, there is limited evidence suggesting that a gluten-free diet may improve some autistic behaviors. + +There is tentative evidence that music therapy may improve social interactions, verbal communication, and non-verbal communication skills.
There has been early research looking at hyperbaric treatments in children with autism. Studies on pet therapy have shown positive effects. + +There is no known cure. The degree of symptoms can decrease, occasionally to the extent that people lose their diagnosis of ASD; this occurs sometimes after intensive treatment and sometimes not. It is not known how often recovery happens; reported rates in unselected samples have ranged from 3% to 25%. Most children with autism acquire language by age five or younger, though a few have developed communication skills in later years. Many children with autism lack social support, future employment opportunities or self-determination. Although core difficulties tend to persist, symptoms often become less severe with age. + +Few high-quality studies address long-term prognosis. Some adults show modest improvement in communication skills, but a few decline; no study has focused on autism after midlife. Acquiring language before age six, having an IQ above 50, and having a marketable skill all predict better outcomes; independent living is unlikely with severe autism. + +Many individuals with autism face significant obstacles in transitioning to adulthood. Compared to the general population, individuals with autism are more likely to be unemployed and to have never had a job. About half of people in their 20s with autism are not employed. + +Most recent reviews tend to estimate a prevalence of 1–2 per 1,000 for autism and close to 6 per 1,000 for ASD as of 2007. A 2016 survey in the United States reported a rate of 25 per 1,000 children for ASD. Globally, autism affects an estimated 24.8 million people, while Asperger syndrome affects a further 37.2 million. In 2012, the NHS estimated that the overall prevalence of autism among adults aged 18 years and over in the UK was 1.1%. Rates of PDD-NOS have been estimated at 3.7 per 1,000, Asperger syndrome at roughly 0.6 per 1,000, and childhood disintegrative disorder at 0.02 per 1,000. The CDC estimated a rate of about 1 in 59 children (1.7%) for 2014, an increase from 1 in 68 (1.5%) for 2010. + +The number of reported cases of autism increased dramatically in the 1990s and early 2000s. This increase is largely attributable to changes in diagnostic practices, referral patterns, availability of services, age at diagnosis, and public awareness, though unidentified environmental risk factors cannot be ruled out. The available evidence does not rule out the possibility that autism's true prevalence has increased; a real increase would suggest directing more attention and funding toward changing environmental factors instead of continuing to focus on genetics. + +Boys are at higher risk for ASD than girls. The sex ratio averages 4.3:1 and is greatly modified by cognitive impairment: it may be close to 2:1 with intellectual disability and more than 5.5:1 without. Several theories about the higher prevalence in males have been investigated, but the cause of the difference is unconfirmed; one theory is that females are underdiagnosed. + +Although the evidence does not implicate any single pregnancy-related risk factor as a cause of autism, the risk of autism is associated with advanced age in either parent, and with diabetes, bleeding, and use of psychiatric drugs in the mother during pregnancy.
The risk is greater with older fathers than with older mothers; two potential explanations are the known increase in mutation burden in older sperm, and the hypothesis that men marry later if they carry genetic liability and show some signs of autism. Most professionals believe that race, ethnicity, and socioeconomic background do not affect the occurrence of autism. + +Several other conditions are common in children with autism, including genetic disorders, intellectual disability, anxiety disorders, epilepsy, and sleep problems. + +A few examples of autistic symptoms and treatments were described long before autism was named. The "Table Talk" of Martin Luther, compiled by his notetaker, Mathesius, contains the story of a 12-year-old boy who may have been severely autistic. Luther reportedly thought the boy was a soulless mass of flesh possessed by the devil, and suggested that he be suffocated, although a later critic has cast doubt on the veracity of this report. The earliest well-documented case of autism is that of Hugh Blair of Borgue, as detailed in a 1747 court case in which his brother successfully petitioned to annul Blair's marriage to gain Blair's inheritance. The Wild Boy of Aveyron, a feral child caught in 1798, showed several signs of autism; the medical student Jean Itard treated him with a behavioral program designed to help him form social attachments and to induce speech via imitation. + +The New Latin word "autismus" (English translation "autism") was coined by the Swiss psychiatrist Eugen Bleuler in 1910 as he was defining symptoms of schizophrenia. He derived it from the Greek word "autós" (αὐτός, meaning "self"), and used it to mean morbid self-admiration, referring to "autistic withdrawal of the patient to his fantasies, against which any influence from outside becomes an intolerable disturbance". A Soviet child psychiatrist, Grunya Sukhareva, described a similar syndrome in a paper published in Russian in 1925, and in German in 1926. + +The word "autism" first took its modern sense in 1938 when Hans Asperger of the Vienna University Hospital adopted Bleuler's terminology "autistic psychopaths" in a lecture in German about child psychology. Asperger was investigating an ASD now known as Asperger syndrome, though for various reasons it was not widely recognized as a separate diagnosis until 1981. Leo Kanner of the Johns Hopkins Hospital first used "autism" in its modern sense in English when he introduced the label "early infantile autism" in a 1943 report of 11 children with striking behavioral similarities. Almost all the characteristics described in Kanner's first paper on the subject, notably "autistic aloneness" and "insistence on sameness", are still regarded as typical of the autistic spectrum of disorders. It is not known whether Kanner derived the term independently of Asperger. + +Donald Triplett was the first person diagnosed with autism. He was diagnosed by Kanner after being first examined in 1938, and was labeled as "case 1". Triplett was noted for his savant abilities, particularly being able to name musical notes played on a piano and to mentally multiply numbers. His father, Oliver, described him as socially withdrawn but interested in number patterns, music notes, letters of the alphabet, and U.S. president pictures. By the age of 2, he could recite the 23rd Psalm and had memorized 25 questions and answers from the Presbyterian catechism. He was also interested in creating musical chords.
+ +Kanner's reuse of "autism" led to decades of confused terminology like "infantile schizophrenia", and child psychiatry's focus on maternal deprivation led to misconceptions of autism as an infant's response to "refrigerator mothers". Starting in the late 1960s, autism was established as a separate syndrome. + +As late as the mid-1970s there was little evidence of a genetic role in autism, whereas by 2007 it was believed to be one of the most heritable psychiatric conditions. Although the rise of parent organizations and the destigmatization of childhood ASD have affected how ASD is viewed, parents continue to feel social stigma in situations where their child's autistic behavior is perceived negatively, and many primary care physicians and medical specialists express some beliefs consistent with outdated autism research. + +It took until 1980 for the DSM-III to differentiate autism from childhood schizophrenia. In 1987, the DSM-III-R provided a checklist for diagnosing autism. In May 2013, the DSM-5 was released, updating the classification for pervasive developmental disorders. The grouping of disorders, including PDD-NOS, autism, Asperger syndrome, Rett syndrome, and CDD, has been removed and replaced with the general term autism spectrum disorder. The two categories that exist are impaired social communication and/or interaction, and restricted and/or repetitive behaviors. + +The Internet has helped autistic individuals bypass nonverbal cues and emotional sharing that they find difficult to deal with, and has given them a way to form online communities and work remotely. Societal and cultural aspects of autism have developed: some in the community seek a cure, while others believe that autism is simply another way of being. + +An autistic culture has emerged, accompanied by the autistic rights and neurodiversity movements. Events include World Autism Awareness Day, Autism Sunday, Autistic Pride Day, Autreat, and others. Organizations dedicated to promoting awareness of autism include Autistic Self Advocacy Network, Aspies For Freedom, Autism National Committee, and Autism Society of America. At the same time, some organizations, including Autism Speaks, have been condemned by disability rights organizations for failing to support autistic people. Social-science scholars study those with autism in hopes of learning more about "autism as a culture, transcultural comparisons... and research on social movements." While most autistic individuals do not have savant skills, many have been successful in their fields. + +The autism rights movement is a social movement within the context of disability rights that emphasizes the concept of neurodiversity, viewing the autism spectrum as a result of natural variations in the human brain rather than a disorder to be cured. The autism rights movement advocates for greater acceptance of autistic behaviors; therapies that focus on coping skills rather than on imitating the behaviors of those without autism; and the recognition of the autistic community as a minority group. Autism rights or neurodiversity advocates believe that the autism spectrum is genetic and should be accepted as a natural expression of the human genome. This perspective is distinct from two other views: the medical perspective, that autism is caused by a genetic defect and should be addressed by targeting the autism gene(s), and fringe theories that autism is caused by environmental factors such as vaccines.
A common criticism of autistic activists is that the majority of them are "high-functioning" or have Asperger syndrome and do not represent the views of "low-functioning" autistic people. + +About half of autistics are unemployed, and one third of those with graduate degrees may be unemployed. Among autistics who find work, most are employed in sheltered settings, working for wages below the national minimum. While employers cite concerns about productivity and supervision when hiring, experienced employers of autistics report above-average memory and attention to detail, as well as a high regard for rules and procedure, in their autistic employees. A majority of the economic burden of autism is caused by decreased earnings in the job market. Some studies also find decreased earnings among parents who care for autistic children. + + + \ No newline at end of file diff --git a/fixtures/tests_samples/wmt16/sample.json b/fixtures/tests_samples/wmt16/sample.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0e47b0648a2817d3f08d498f011e98d31f8e46 --- /dev/null +++ b/fixtures/tests_samples/wmt16/sample.json @@ -0,0 +1,10 @@ +{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}} +{"translation": {"en": "Approval of Minutes of previous sitting: see Minutes", "ro": "Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal"}} +{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}} +{"translation": {"en": "Verification of credentials: see Minutes", "ro": "Verificarea prerogativelor: a se vedea procesul-verbal"}} +{"translation": {"en": "Documents received: see Minutes", "ro": "Depunere de documente: a se vedea procesul-verbal"}} +{"translation": {"en": "Written statements and oral questions (tabling): see Minutes", "ro": "Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal"}} +{"translation": {"en": "Petitions: see Minutes", "ro": "Petiţii: a se vedea procesul-verbal"}} +{"translation": {"en": "Texts of agreements forwarded by the Council: see Minutes", "ro": "Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal"}} +{"translation": {"en": "Action taken on Parliament's resolutions: see Minutes", "ro": "Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal"}} +{"translation": {"en": "Agenda for next sitting: see Minutes", "ro": "Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal"}} diff --git a/fixtures/tests_samples/wmt_en_ro/test.json b/fixtures/tests_samples/wmt_en_ro/test.json new file mode 100644 index 0000000000000000000000000000000000000000..2841b1b6aab9ed5ef54bfa4d60c82e9c1b676a09 --- /dev/null +++ b/fixtures/tests_samples/wmt_en_ro/test.json @@ -0,0 +1,20 @@ +{ "translation": { "en": "UN Chief Says There Is No Military Solution in Syria Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that \"there is no military solution\" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people. The U.N. chief again urged all parties, including the divided U.N. Security Council, to unite and support inclusive negotiations to find a political solution.
Ban told a news conference Wednesday that he plans to meet with foreign ministers of the five permanent council nations - the U.S., Russia, China, Britain and France - on the sidelines of the General Assembly's ministerial session later this month to discuss Syria.", "ro": "Șeful ONU declară că nu există soluții militare în Siria Secretarul General Ban Ki-moon afirmă că răspunsul său la suportul militar al Rusiei pentru Siria este că „nu există o soluție militară” la conflictul care durează de aproape cinci ani iar mai multe arme nu ar face decât să agraveze violența și suferința a milioane de oameni. Șeful ONU a solicitat din nou tuturor părților, inclusiv Consiliului de securitate ONU divizat să se unifice și să susțină negocierile pentru a găsi o soluție politică. Ban a declarat miercuri în cadrul unei conferințe că intenționează să se întâlnească luna aceasta cu miniștrii de externe din cinci țări permanent prezente în consiliu - SUA, Rusia, China, Anglia și Franța - pe marginea sesiunii ministeriale a Adunării Generale pentru a discuta despre Siria." } } +{ "translation": { "en": "He expressed regret that divisions in the council and among the Syrian people and regional powers \"made this situation unsolvable.\" Ban urged the five permanent members to show the solidarity and unity they did in achieving an Iran nuclear deal in addressing the Syria crisis. 8 Poll Numbers That Show Donald Trump Is For Real Some have tried to label him a flip-flopper. Others have dismissed him as a joke. And some are holding out for an implosion. But no matter how some Republicans are trying to drag Donald Trump down from atop the polls, it hasn't worked (yet).", "ro": "Ban și-a exprimat regretul că divizările în consiliu și între poporul sirian și puterile regionale „au făcut această situație de nerezolvat”. Ban le-a cerut celor cinci membri permanenți să dea dovadă de solidaritatea și unitatea arătate atunci când au reușit să încheie un acord referitor la armele nucleare ale Iranului, abordând astfel criza din Siria. 8 cifre din sondaje care arată că Donald Trump are șanse reale Unii au încercat să îl eticheteze ca politician „flip-flop”. Alții l-au numit o glumă. Iar alții așteaptă implozia. Însă indiferent de modul în care unii republicani încearcă să îl dărâme pe Donald Trump din vârful sondajelor, nu a funcționat (încă)." } } +{ "translation": { "en": "Ten of the last 11 national polls have shown Donald Trump's lead at double digits, and some are starting to ask seriously what it means for the real estate mogul's nomination chances. Of course, it's still early in the election cycle. None of this is to say that Trump is likely to win the Republican nomination. Pundits point out that at this time in 2011, Rick Perry's lead was giving way to a rising Herman Cain, neither of whom won even one state in the nomination process. And there are many reasons he would struggle in a general election. But outside groups like Jeb Bush's Super PAC and the economic conservative group Club for Growth are recognizing Trump's staying power and beginning to unload their dollars to topple him.", "ro": "Zece din ultimele 11 sondaje naționale au arătat că Donald Trump conduce cu un procent din două cifre iar unele voci încep să se întrebe serios ce înseamnă acest lucru pentru șansele de numire ale mogulului imobiliar. Desigur, este încă prematur. Nimic din toate acestea nu spune că Trump va câștiga cursa pentru nominalizarea republicanilor. 
Pundits arată că, în aceeași perioadă a anului 2011, avansul lui Rick Perry îi făcea loc lui Herman Cain în sondaje, dar niciunul dintre ei nu a câștigat în vreun stat în cursa de nominalizare. Iar motivele pentru care s-ar lupta din greu la alegerile generale sunt numeroase. Însă grupurile din exterior precum Super PAC al lui Jeb Bush și grupul conservator economic Club for Growth admit puterea lui Trump și încep să îl susțină cu bani." } } +{ "translation": { "en": "Here are some recent poll numbers that suggest that the real estate mogul isn't just a passing phase: Trump's favorability ratings have turned 180 degrees. Right before Donald Trump announced his candidacy in mid-June, a Monmouth University poll showed only two in 10 Republicans had a positive view of the real estate mogul. By mid-July, it was 40 percent. In early August, it was 52 percent. Now, six in 10 Republicans have a favorable view of Donald Trump. Roughly three in 10 say they have a negative view. And these numbers hold up in early states. A Quinnipiac poll in Iowa last week found that 60 percent of Republicans there had a favorable view of Trump.", "ro": "În continuare vă prezentăm câteva cifre din sondaje recente care sugerează că mogulul imobiliar nu este doar ceva trecător: Cifrele care indică susținerea față de Trump s-au întors la 180 grade. Chiar înainte ca Donald Trump să își anunțe candidatura, la mijlocul lui iunie, un sondaj realizat de Universitatea din Monmouth arăta că doar doi din 10 republicani aveau o părere pozitivă despre mogulul imobiliar. Până la mijlocul lui iulie, procentul a urcat la 40%. La începutul lui august, era 52%. În prezent, șase din 10 republicani au o părere favorabilă despre Donald Trump. Aproximativ trei din 10 declară că au o părere negativă. Aceste cifre se mențin. Un sondaj realizat săptămâna trecută de Quinnipiac în Iowa a concluzionat că 60% dintre republicanii din regiune au o părere favorabilă despre Trump." } } +{ "translation": { "en": "Two-thirds of GOP voters would be happy with Trump as the nominee. In a CNN/ORC poll last week, 67 percent of Republicans said they would be either \"enthusiastic\" or \"satisfied\" if Trump were the nominee. Only two in 10 say they would be \"upset\" if he were the nominee. Only Ben Carson generates roughly the same level of enthusiasm as Trump (43 percent say they would be \"enthusiastic\" vs. 40 percent who say the same of Trump). The next closest in enthusiasm? Marco Rubio with only 21 percent.", "ro": "Două treimi dintre alegătorii GOP ar fi fericiți dacă Trump ar câștiga cursa pentru nominalizare. Într-un sondaj realizat săptămâna trecută de CNN/ORC, 67% dintre republicani au declarat că ar fi „entuziasmați” sau „mulțumiți” dacă Trump ar câștiga cursa pentru nominalizare. Doar doi din 10 declară că ar fi „supărați” dacă Trump ar câștiga cursa pentru nominalizare. Doar Ben Carson generează aproximativ același nivel de entuziasm ca Trump (43% declară că ar fi „entuziasmați” față de 40% care declară același lucru despre Trump). Cel mai aproape în ceea ce privește entuziasmul? Marco Rubio, cu doar 21%." } } +{ "translation": { "en": "On the flip side, 47 percent of Republican voters say they would be \"dissatisfied\" or \"upset\" if establishment favorite Jeb Bush becomes the nominee. A majority of Republicans don't see Trump's temperament as a problem. 
While Donald Trump has been widely criticized for his bombast and insults, 52 percent of leaned Republican voters nationwide think that the real estate mogul has the right temperament to be president, according to Monday's ABC News/Washington Post poll. The same number holds in the first-in-the-nation caucus state of Iowa, where the same 52 percent of Republicans think he has the personality to be commander in chief, according to Quinnipiac last week.", "ro": "De partea cealaltă, 47% dintre alegătorii republicani afirmă că ar fi „nemulțumiți” sau „supărați” dacă favoritul Jeb Bush câștigă cursa pentru nominalizare. Majoritatea republicanilor nu consideră temperamentul lui Trump o problemă. Deși Donald Trump a fost puternic criticat pentru insultele aduse și stilul său bombastic, 52% dintre alegătorii republicani la nivel național consideră că mogulul imobiliar are temperamentul potrivit pentru a fi președinte, conform sondajului realizat luni de ABC News/Washington Post. Regăsim aceleași cifre în statul Iowa, unde tot 52% dintre republicani cred că Trump are personalitatea potrivită pentru a fi conducător, conform sondajului realizat săptămâna trecută de Quinnipiac." } } +{ "translation": { "en": "Still, 44 percent think he doesn't have the personality to serve effectively, and almost six in 10 independents say his temperament does not belong in the White House, according to ABC/Post. Republican voters are getting used to the idea. When they put on their pundit hats, Republican voters think Trump is for real. When asked who is most likely to win the GOP nomination, four in 10 said Trump was the best bet, according to a CNN/ORC poll out last week. That's a change from when four in 10 placed their money on Jeb Bush in late July. Full disclosure: GOP voters haven't had the clearest crystal ball in the past.", "ro": "Totuși, 44% sunt de părere că nu are personalitatea necesară pentru a acționa eficient și aproape șase din 10 independenți afirmă că temperamentul său nu are ce căuta la Casa Albă, conform ABC/Post. Alegătorii republicani se obișnuiesc cu ideea. Atunci când iau atitudinea de intelectuali, alegătorii republicani consideră că Trump este autentic. Conform unui sondaj realizat săptămâna trecută de CNN/ORC, la întrebarea cine are cele mai multe șanse să câștige cursa pentru nominalizare GOP, patru din 10 au declarat că Trump. Situația s-a schimbat față de finalul lui iulie, când patru din 10 ar fi pariat pe Jeb Bush. Informare completă: în trecut, alegătorii GOP nu au citit foarte bine viitorul." } } +{ "translation": { "en": "At this time last cycle, four in 10 Republicans picked Rick Perry to win the nomination, vs. only 28 percent for eventual nominee Mitt Romney. Still, it shows that a plurality of GOP voters see Trump's campaign as plausible. Even if Republicans rallied around another candidate, Trump still beats almost everyone. Some pundits point out that the splintered field is likely contributing to Trump's lead, while anti-Trump support is be spread diffusely among more than a dozen other candidates. But a Monmouth University poll in early September shows that, in a hypothetical head-to-head matchup between Trump and most other Republican candidates, Trump almost always garners majority support.", "ro": "În aceeași perioadă a ultimelor alegeri, patru din 10 republicani l-au ales pe Rick Perry în cursa pentru nominalizare, față de doar 28% pentru Mitt Romney. Însă, aceste cifre arată că majoritatea alegătorilor GOP consideră plauzibilă campania lui Trump. 
Chiar dacă republicanii sau repliat spre un alt candidat. Trump încă se află în fruntea tuturor. Unele voci spun că situația divizată va contribui probabil la victoria lui Trump, în timp ce susținerea contra lui Trump se va împărți la mai mult de doisprezece candidați. Însă un sondaj derulat la începutul lui septembrie de Universitatea din Monmouth arată că, în situația ipotetică a unei colaborări între Trump și majoritatea celorlalți candidați republicani, aproape întotdeauna Trump va beneficia de susținerea majoritară." } } +{ "translation": { "en": "He leads Carly Fiorina by 13 points, Marco Rubio by 14 points, Walker by 15 points, Jeb Bush by 19 points, and, finally, Rand Paul, John Kasich and Chris Christie by 33 points each. He's in a dead heat with Ted Cruz. The only candidate who beats him? Ben Carson would lead the businessman by a wide 19 points in a hypothetical head-to-head. A bare majority of Donald Trump's supporters say they've made up their minds. A new CBS/NYT poll out on Tuesday shows that just more than half of voters who support Trump say they have locked in their votes. Obviously, a lot can happen to change that, and no one can really say they would never change their mind.", "ro": "Trump se află la distanță de 13 puncte de Carly Fiorina, la 14 puncte de Marco Rubio, la 15 puncte de Walker, la 19 puncte de Jeb Bush și, în cele din urmă, la câte 33 de puncte față de Rand Paul, John Kasich și Chris Christie. Este aproape la egalitate cu Ted Cruz. Singurul candidat care îl învinge? Ben Carson l-ar învinge pe omul de afaceri cu 19 puncte într-o confruntare ipotetică de unu la unu. Majoritatea susținătorilor lui Donald Trump declară că s-au decis. Un nou sondaj realizat marți de CBS/NYT arată că peste jumătate dintre alegătorii care îl susțin pe Trump declară că nu își schimbă opțiunea de vot. Evident, se pot întâmpla multe în acest sens și nimeni nu poate spune că aceștia nu se vor răzgândi niciodată." } } +{ "translation": { "en": "46 percent said they are leaving the door open to switching candidates. Still, Trump's strongest competition at the moment is from fellow outsider neurosurgeon Ben Carson, but voters who say they have made up their minds are twice as likely to go for Trump. Six in 10 Republicans say they agree with Trump on immigration. Even since Donald Trump called immigrants from Mexico \"rapists\" in his campaign announcement speech two months ago, immigration has been front and center in the 2016 conversation. Some are worried that Trump's bombast will drive crucial Hispanic voters away from the Republican Party and damage rebranding efforts.", "ro": "46% afirmă că lasă portița deschisă posibilității de a-și schimba opțiunea. Cu toate acestea, cel mai important adversar al lui Trump este în prezent neurochirurgul Ben Carson, însă este de două ori mai probabil ca alegătorii care declară că s-au decis să voteze cu Trump. Șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. De când Donald Trump i-a numit pe imigranții din Mexic „violatori” în discursul de deschidere a campaniei sale, în urmă cu două luni, imigrarea a fost subiectul central în campania pentru 2016. Unii sunt îngrijorați că stilul bombastic al lui Trump va duce la o scindare între alegătorii hispanici importanți și Partidul Republican și va prejudicia eforturile de rebranding." } } +{ "translation": { "en": "But according to Monday's new ABC/Post poll, six in 10 Republicans say they agree with Trump on immigration issues. 
So as long as immigration remains in the spotlight, it seems Donald Trump will remain too. Frustration with government is climbing to new highs. Donald Trump and Ben Carson now account for roughly half of the support from Republican voters, largely due to their outsider status. Six in 10 Republicans in Monday's new ABC/Post poll say they want a political outsider over someone with government experience. And they are angry at Washington, too.", "ro": "Însă, conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. Așa că, se pare că atâta timp cât problema imigrării rămâne în lumina reflectoarelor, la fel va rămâne și Doland Trump. Frustrarea față de autorități atinge noi culmi. Donald Trump și Ben Carson sunt acum susținuți de aproape jumătate dintre alegătorii republicani, în mare parte datorită statutului lor de outsideri. Conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că preferă un outsider politic în detrimentul cuiva cu experiență în guvernare. Oamenii sunt de asemenea supărați pe autoritățile de la Washington." } } +{ "translation": { "en": "A Des Moines Register/Bloomberg poll in Iowa from two weeks ago shows that three in four Iowa Republicans are frustrated with Republicans in Congress, with 54 percent \"unsatisfied\" and 21 percent \"mad as hell.\" Jeremy Corbyn to make debut at Prime Minister's Questions Since his election, Mr Corbyn's debut at PMQs has been keenly awaited New Labour leader Jeremy Corbyn is to make his debut at Prime Minister's Questions later, taking on David Cameron for the first time.", "ro": "Un sondaj derulat în urmă cu două săptămâni în Iowa de către Des Moines Register/Bloomberg arată că trei din patru republicani din Iowa sunt frustrați de prestația republicanilor din COngres, 54% declarându-se „nemulțumiți” iar 21% „nervoși la culme”. Jeremy Corbyn își face debutul la Prime Minister's Questions Încă de la alegerea sa, debutul domnului Corbyn la PMQs a fost îndelung așteptat Noul lider al Partidului Laburist, Jeremy Corbyn, își va face mai târziu debutul la Prime Minister's Questions, confruntându-se pentru prima dată cu David Cameron." } } +{ "translation": { "en": "Mr Corbyn will rise to ask the first of his six allotted questions shortly after midday, with his performance likely to be closely scrutinised by the media and Labour MPs. He has called for \"less theatre and more facts\" at the weekly showpiece. He has also said he could skip some sessions, leaving them to colleagues. The encounter will be the first parliamentary test of Mr Corbyn's leadership, coming after his appointment of a shadow cabinet and his speech to the TUC annual congress on Tuesday.", "ro": "Dl Corbyn va adresa primele dintre cele șase întrebări la care are dreptul la scurt timp după prânz; prestația sa va fi probabil analizată îndeaproape de mass-media și parlamentarii laburiști. În cadrul aparițiilor săptămânale, el a cerut „mai puțin teatru și mai multe fapte”. A declarat de asemenea că poate renunța la câteva participări și că le cedează colegilor săi. Confruntarea va fi primul test parlamentar al Dl Corbyn în poziție de lider, venind după ce a numit un „cabinet fantomă” și după discursul pe care l-a ținut marți la congresul anual TUC." 
} } +{ "translation": { "en": "Meanwhile, the Labour leader's decision to stand in silence during the singing of the national anthem at a service on Tuesday to mark the 75th anniversary of the Battle of Britain has attracted criticism from a number of Tory MPs and is the focus of several front page stories in the newspapers. Mr Corbyn's decision not to sing the national anthem has attracted attention A spokesman for Mr Corbyn said he had \"stood in respectful silence\" and did recognise the \"heroism of the Royal Air Force in the Battle of Britain.\"", "ro": "Între timp, decizia liderului Partidului laburist de a păstra tăcerea la rostirea imnului național în cadrul unei slujbe ținute marți cu ocazia aniversării a 75 de ani de la Bătălia Angliei a atras critici din partea unor parlamentari conservatori și a ținut prima pagină a ziarelor. Decizia domnului Corbyn de a nu cânta imnul național a atras atenția Un purtător de cuvânt al Dl Corbyn a declarat că acesta „a păstrat tăcerea în mod respectuos” și a recunoscut „eroismul Forțelor aeriene britanice în Bătălia Angliei.”" } } +{ "translation": { "en": "But a member of Mr Corbyn's shadow cabinet, Owen Smith, told BBC Two's Newsnight programme he would have advised the Labour leader to sing the national anthem \"irrespective\" of his belief that the monarchy should be abolished. Nearly a dozen shadow ministers have refused to serve in Mr Corbyn's top team, citing differences over the economy, defence and foreign affairs, while less than a sixth of the parliamentary party originally backed him as leader. BBC political correspondent Robin Brant says policy differences are also \"stacking up\" within Labour following Mr Corbyn's appointment over its position on the European Union and the government's cap on benefits.", "ro": "Însă un membru al cabinetului fantomă al Dl Corbyn, Owen Smith, a declarat pentru emisiunea Two's Newsnight transmisă de BBC că i-ar fi recomandat liderului laburist să cânte imnul național „indiferent” de credința sa că monarhia ar trebui abolită. În jur de doisprezece miniștri din cabinetul fantomă au refuzat să facă parte din echipa de frunte a Dl Corbyn, argumentând prin diferențe de opinie legate de economie, apărare și externe, în timp ce mai puțin de o șesime din partidul parlamentar l-a susținut ca lider. Corespondentul politic al BBC, Robin Brant, declară că diferențele de politică „se cumulează” în Partidul Laburist după numirea domnului Corbyn referitor la poziția sa față de Uniunea Europeană și limita de beneficii." } } +{ "translation": { "en": "Mr Corbyn told the TUC conference Labour was putting forward amendments to remove the whole idea of a cap altogether. Hours later Mr Smith, the shadow work and pensions secretary, said the party was \"very clear\" that it was only opposing government plans to reduce the level of cap from £26,000 to £23,000. Mr Corbyn will be the fifth Labour leader that David Cameron has faced across the despatch box over the past decade since he became Tory leader. The Labour leader, who has promised a different approach to politics, says he has \"crowd sourced\" ideas for questions to ask Mr Cameron and has been given more than 30,000 suggestions.", "ro": "Dl Corbyn a declarat la conferința TUC că Partidul Laburist va aduce modificări prin care se va elimina integral ideea limitării. 
Câteva ore mai târziu, Dl Smith, Ministrul Muncii și Pensiilor, a declarat că partidul „este foarte clar” în opoziția exclusivă față de planurile guvernului de a reduce nivelul „cap” de la 26.000 lire la 23.000 lire. Dl Corbyn va fi al cincilea lider laburist cu care se confruntă David Cameron la tribună în ultimul deceniu, de când a preluat conducerea Partidului Conservator. Liderul laburist, care a promis o abordare diferită a politicii, spune că are idei „din surse externe” pentru întrebări pe care să i le adreseze Domnului Cameron și că a primit peste 30.000 de sugestii." } } +{ "translation": { "en": "The Islington North MP has said PMQs is too confrontational and that he will refrain from both \"repartee\" and trading barbs, instead vowing to focus on serious issues such as poverty, inequality and the challenges facing young people. Mr Corbyn has said that Angela Eagle, the shadow business secretary, will deputise for him at PMQs when he does not attend - for instance when Mr Cameron is travelling abroad. He has also floated the idea of allowing other colleagues to take the floor on occasion, saying he had approached the Commons Speaker John Bercow to discuss the issue.", "ro": "Parlamentarul Islington North a afirmat că PMQs implică un nivel de confruntare prea înalt și că se va abține de la replici și atacuri, angajându-se să se concentreze în schimb pe probleme serioase precum sărăcia, inegalitatea și provocările cu care se confruntă tinerii. Dl Corbyn a declarat că Angela Eagle, Ministrul de finanțe, îi va ține locul la PMQs atunci când el nu poate participa - de exemplu atunci când Dl Cameron se deplasează în străinătate. A exprimat de asemenea ideea că va permite altor colegi să ia cuvântul ocazional, spunând că l-a abordat pe Președintele Camerei Deputaților, John Bercow, pentru a discuta acest aspect." } } +{ "translation": { "en": "When he became leader in 2005, Mr Cameron said he wanted to move away from the \"Punch and Judy\" style of politics often associated with PMQs but admitted some years later that he had failed. Since it was first televised in 1990, PMQs has been seen as a key barometer of a leader's judgement, their command of the Commons and their standing among their fellow MPs although critics have argued it has become a caricature and is in need of far-reaching reforms. 'Shot in Joburg': Homeless youth trained as photographers Downtown Johannesburg is a tough place to be homeless.", "ro": "În 2005, când a preluat conducerea, Dl Cameron a declarat că dorește să renunțe la stilul politic „Punch and Judy” asociat adesea cu PMQs însă a recunoscut câțiva ani mai târziu că nu a reușit în demersul său. De la prima transmisie, în 1990, PMQs a fost considerată un barometru cheie al raționamentului unui lider, al modului în care acesta conduce Camera Deputaților și a poziției sale în rândul colegilor parlamentari, deși criticii afirmă a ca devenit o caricatură și că are nevoie de o reformare profundă. „Cadru în Joburg”: Tineri fără adăpost beneficiază de cursuri de fotografie Este dificil să fii un om fără adăpost în Johannesburg." } } +{ "translation": { "en": "But one group of former street children have found a way to learn a skill and make a living. \"I was shot in Joburg\" is a non-profit studio that teaches homeless youngsters how to take photographs of their neighbourhood and make a profit from it. BBC News went to meet one of the project's first graduates. 
JD Sports boss says higher wages could hurt expansion JD Sports Executive Chairman Peter Cowgill says a higher minimum wage for UK workers could mean \"more spending power in the pockets of potential consumers.\" But that spending power is unlikely to outweigh the higher labour costs at his firm, he says.", "ro": "Însă un grup de oameni care au trăit pe străzi în copilărie au găsit un mod de a învăța o meserie și de a-și câștiga traiul. „I was shot în Joburg” este un studio non-profit care îi învață pe tinerii fără adăpost să facă fotografii ale zonelor în care trăiesc și să câștige bani din asta. BBC News s-a întâlnit cu unul dintre primii absolvenți ai proiectului. Șeful JD Sports spune că salariile mai mari ar putea dăuna extinderii Președintele JD Sports, Peter Cowgill, declară că o creștere a salariului minim în Marea Britanie ar putea însemna „o putere de cumpărare mai mare în buzunarele potențialilor consumatori.” Este însă puțin probabil ca respectiva putere de cumpărare să depășească costurile mai mari pentru forța de muncă în cadrul firmei, afirmă el." } } +{ "translation": { "en": "The costs could hit JD Sports' expansion plans, he added, which could mean fewer extra jobs. Thanasi Kokkinakis backed by Tennis Australia president Steve Healy Thanasi Kokkinakis deserves kudos rather than criticism for his behaviour. Thanasi Kokkinakis has been the collateral damage in the recent storm around his friend Nick Kyrgios and deserves kudos rather than criticism for his own behaviour, according to Tennis Australia president Steve Healy.", "ro": "Costurile ar putea avea impact asupra planurilor de extindere ale JD Sports, a adăugat el, ceea ce ar putea însemna mai puține locuri de muncă noi. Thanasi Kokkinakis susținut de președintele Tennis Australia, Steve Healy Thanasi Kokkinakis ar merita să fie lăudat și nu criticat pentru comportamentul său. Thanasi Kokkinakis a fost victimă colaterală în „furtuna” creată în jurul prietenului său, Nick Kyrgios, iar comportamentul său merită mai degrabă cuvinte de laudă și nu critică, în opinia președintelui Tennis Australia, Steve Healy." 
} } diff --git a/fixtures/tests_samples/wmt_en_ro/train.json b/fixtures/tests_samples/wmt_en_ro/train.json new file mode 100644 index 0000000000000000000000000000000000000000..269d5156c23e5b1dbe51db6ec39618e48eefa17b --- /dev/null +++ b/fixtures/tests_samples/wmt_en_ro/train.json @@ -0,0 +1,11 @@ +{ "translation": { "en": "Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes", "ro": "Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal" } } +{ "translation": { "en": "Membership of Parliament: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Verification of credentials: see Minutes Documents received: see Minutes Written statements and oral questions (tabling): see Minutes Petitions: see Minutes Texts of agreements forwarded by the Council: see Minutes Action taken on Parliament's resolutions: see Minutes Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 7.45 p.m.)", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Verificarea prerogativelor: a se vedea procesul-verbal Depunere de documente: a se vedea procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Petiţii: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Se levanta la sesión a las 19.45 horas)" } } +{ "translation": { "en": "Election of Vice-Presidents of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 12.40 p.m. and resumed at 3.00 p.m.) Election of Quaestors of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 3.25 p.m. and resumed at 6.00 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 6.15 p.m.) Opening of the sitting (The sitting was opened at 9.35 a.m.) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes", "ro": "Alegerea vicepreşedinţilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 12.40 Uhr unterbrochen und um 15.00 Uhr wiederaufgenommen). Alegerea chestorilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 15.25 Uhr unterbrochen und um 18.00 Uhr wiederaufgenommen). Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 18.15 Uhr geschlossen.) Deschiderea şedinţei (Die Sitzung wird um 9.35 Uhr eröffnet.) 
Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal" } } +{ "translation": { "en": "Membership of committees (deadline for tabling amendments): see Minutes (The sitting was suspended at 7 p.m. and resumed at 9 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was suspended at 23.25 p.m.) Documents received: see Minutes Communication of Council common positions: see Minutes (The sitting was suspended at 11.35 a.m. and resumed for voting time at noon) Approval of Minutes of previous sitting: see Minutes Committee of Inquiry into the crisis of the Equitable Life Assurance Society (extension of mandate): see Minutes", "ro": "Componenţa comisiilor (termenul de depunere a amendamentelor): consultaţi procesul-verbal (La seduta, sospesa alle 19.00, è ripresa alle 21.00) Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 23.25 Uhr geschlossen.) Depunerea documentelor: a se vedea procesul-verbal Comunicarea poziţiilor comune ale Parlamentului: a se vedea procesul-verbal (La séance, suspendue à 11h35 dans l'attente de l'Heure des votes, est reprise à midi) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Comisia de anchetă privind criza societăţii de asigurări \"Equitable Life” (prelungirea mandatului): consultaţi procesul-verbal" } } +{ "translation": { "en": "Announcement by the President: see Minutes 1. Membership of committees (vote) 2. Amendment of the ACP-EC Partnership Agreement (vote) 4. Certification of train drivers operating locomotives and trains on the railway system in the Community (vote) 6. Law applicable to non-contractual obligations (\"ROME II\") (vote) 8. Seventh and eighth annual reports on arms exports (vote) Corrections to votes and voting intentions: see Minutes Membership of committees and delegations: see Minutes Request for waiver of parliamentary immunity: see Minutes Decisions concerning certain documents: see Minutes", "ro": "Comunicarea Preşedintelui: consultaţi procesul-verbal 1. Componenţa comisiilor (vot) 2. Modificarea Acordului de parteneriat ACP-CE (\"Acordul de la Cotonou”) (vot) 4. Certificarea mecanicilor de locomotivă care conduc locomotive şi trenuri în sistemul feroviar comunitar (vot) 6. Legea aplicabilă obligaţiilor necontractuale (\"Roma II”) (vot) 8. Al şaptelea şi al optulea raport anual privind exportul de armament (vot) Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Cerere de ridicare a imunităţii parlamentare: consultaţi procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal" } } +{ "translation": { "en": "Written statements for entry", "ro": "Declaraţii scrise înscrise" } } +{ "translation": { "en": "Written statements for entry in the register (Rule 116): see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes Adjournment of the session I declare the session of the European Parliament adjourned. (The sitting was closed at 1 p.m.) 
Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Request for the defence of parliamentary immunity: see Minutes Appointments to committees (proposal by the Conference of Presidents): see Minutes Documents received: see Minutes Texts of agreements forwarded by the Council: see Minutes", "ro": "Declaraţii scrise înscrise în registru (articolul 116 din Regulamentul de procedură): a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal Întreruperea sesiunii Dichiaro interrotta la sessione del Parlamento europeo. (La seduta è tolta alle 13.00) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Cerere de apărare a imunităţii parlamentare: consultaţi procesul-verbal Numiri în comisii (propunerea Conferinţei preşedinţilor): consultaţi procesul-verbal Depunerea documentelor: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal" } } +{ "translation": { "en": "Action taken on Parliament's resolutions: see Minutes Oral questions and written statements (tabling): see Minutes Written statements (Rule 116): see Minutes Agenda: see Minutes 1. Appointments to parliamentary committees (vote): see Minutes Voting time Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 12 midnight) Opening of the sitting (The sitting was opened at 09.05) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes 1. Protection of passengers against displaced luggage (vote) 2.", "ro": "Continuări ale rezoluţiilor Parlamentului: consultaţi procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Declaraţii scrise (articolul 116 din Regulamentul de procedură) Ordinea de zi: a se vedea procesul-verbal 1. Numiri în comisiile parlamentare (vot): consultaţi procesul-verbal Timpul afectat votului Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (La seduta è tolta alle 24.00) Deschiderea şedinţei (The sitting was opened at 09.05) Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal 1. Protecţia pasagerilor împotriva deplasării bagajelor (vot) 2." } } +{ "translation": { "en": "Approval of motor vehicles with regard to the forward field of vision of the driver (vote) 3. EC-Korea Agreement on scientific and technological cooperation (vote) 4. Mainstreaming sustainability in development cooperation policies (vote) 5. Draft Amending Budget No 1/2007 (vote) 7. EC-Gabon Fisheries Partnership (vote) 10. Limitation periods in cross-border disputes involving personal injuries and fatal accidents (vote) 12. Strategy for a strengthened partnership with the Pacific Islands (vote) 13. The European private company statute (vote) That concludes the vote.", "ro": "Omologarea vehiculelor cu motor cu privire la câmpul de vizibilitate înainte al conducătorului auto (vot) 3. Acordul CE-Coreea de cooperare ştiinţifică şi tehnologică (vot) 4. Integrarea durabilităţii în politicile de cooperare pentru dezvoltare (vot) 5. Proiect de buget rectificativ nr.1/2007 (vot) 7. Acordul de parteneriat în domeniul pescuitului între Comunitatea Europeană şi Republica Gaboneză (vot) 10. 
Termenele de prescripţie aplicabile în cadrul litigiilor transfrontaliere cu privire la vătămările corporale şi accidentele mortale (vot) 12. Relaţiile UE cu insulele din Pacific: Strategie pentru un parteneriat consolidat (vot) 13. Statutul societăţii private europene (vot) Damit ist die Abstimmungsstunde beendet." } } +{ "translation": { "en": "Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes", "ro": "Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal" } } +{ "translation": { "en": "Written statements for entry", "ro": "Declaraţii scrise înscrise" } } diff --git a/fixtures/tests_samples/wmt_en_ro/val.json b/fixtures/tests_samples/wmt_en_ro/val.json new file mode 100644 index 0000000000000000000000000000000000000000..22cdd68ecd1c5bd0018bbe04d756f4c10bd3b919 --- /dev/null +++ b/fixtures/tests_samples/wmt_en_ro/val.json @@ -0,0 +1,16 @@ +{ "translation": { "en": "Brazil's Former Presidential Chief-of-Staff to Stand Trial A federal judge on Tuesday accepted the charges filed against Brazil's former presidential chief of staff for his alleged involvement in a massive corruption scheme at state-owned oil company Petrobras. The federal prosecutor's office said Jose Dirceu will face trial on the corruption, racketeering and money laundering charges filed earlier this month. Fourteen other people will also be tried, including Joao Vaccari Neto, the former treasurer of Brazil's governing Workers' Party and Renato de Souza Duque, Petrobras' former head of corporate services.", "ro": "Fostul șef al cabinetului prezidențial brazilian este adus în fața instanței Marți, un judecător federal a acceptat acuzațiile aduse împotriva fostului șef al cabinetului prezidențial brazilian pentru presupusa implicare a acestuia într-o schemă masivă de corupție privind compania petrolieră de stat Petrobras. Biroul procurorului federal a declarat că Jose Dirceu va fi trimis în judecată pentru acuzațiile de corupție, înșelătorie și spălare de bani aduse în această lună. Alte paisprezece persoane vor fi judecate, printre acestea numărându-se Joao Vaccari Neto, fostul trezorier al Partidului Muncitorilor, aflat la putere în Brazilia, și Renato de Souza Duque, fostul președinte al serviciilor pentru întreprinderi ale Petrobras." } } +{ "translation": { "en": "Dirceu is the most senior member of the ruling Workers' Party to be taken into custody in connection with the scheme. Dirceu served as former President Luiz Inacio Lula da Silva's chief of staff between 2003 and 2005. He was arrested early August in his home, where he already was under house arrest serving an 11-year sentence for his involvement in a cash-for-votes scheme in Congress more than 10 years ago. 
Prosecutors have said that Dirceu masterminded the kickback scheme at Petrobras, accepted bribes while in office and continued to receive payments from contractors after he was jailed in late 2013 for the vote-buying scandal.", "ro": "Dirceu este cel mai vechi membru al Partidului Muncitorilor aflat la guvernare luat în custodie pentru legăturile cu această schemă. Dirceu a servit ca șef de cabinet al fostului președinte Luiz Inacio Lula da Silva între 2003 și 2005. A fost arestat la începutul lui august de acasă, unde deja se afla sub arest la domiciliu, cu o pedeapsă de 11 ani pentru implicarea într-o schemă de cumpărare a voturilor în Congres cu peste 10 ani în urmă. Procurorii au declarat că Dirceu a dezvoltat schema de luare de mită de la Petrobras, a acceptat mită în timp ce se afla în funcție și a continuat să primească plăți de la antreprenori după ce a fost închis la sfârșitul lui 2013 pentru scandalul voturilor cumpărate." } } +{ "translation": { "en": "According to prosecutors, the scheme at Petrobras involved roughly $2 billion in bribes and other illegal funds. Some of that money was allegedly funneled back to campaign coffers of the ruling party and its allies. It also allegedly included the payment of bribes to Petrobras executives in return for inflated contracts. 'Miraculous' recovery for Peshawar massacre schoolboy A teenager paralysed after being shot four times in Pakistan's deadliest terror attack has made a \"miraculous\" recovery following treatment in the UK. Muhammad Ibrahim Khan, 13, had been told by doctors in Pakistan that he would never walk again.", "ro": "Conform procurorilor, schema de la Petrobras a implicat aproximativ 2 miliarde de dolari sub formă de mită și alte fonduri ilegale. O parte din acei bani s-ar fi întors în fondul de campanie al partidului aflat la guvernare și al aliaților acestora. De asemenea, ar fi inclus mită către directorii Petrobras în schimbul unor contracte umflate. Recuperarea „miraculoasă” a unui elev supraviețuitor al masacrului de la Peshawar Un adolescent paralizat după ce fusese împușcat de patru ori în cel mai cumplit atac terorist din Pakistan a reușit o recuperare „miraculoasă” după ce a urmat un tratament în Regatul Unit. Lui Mohamed Ibrahim Khan, în vârstă de 13 ani, doctorii din Pakistan îi spuseseră că nu va mai putea să meargă niciodată." } } +{ "translation": { "en": "At least 140 people, mostly children, were killed when gunmen stormed Peshawar's Army Public School last December. Muhammad, who arrived in London last month for surgery, is being discharged from hospital later. Exactly nine months ago, on an ordinary Tuesday morning, Muhammad sat in his first aid class listening to his teachers intently. At the same time seven gunmen disguised in security uniforms were entering the Army Public School. They were strapped with explosives and had one simple mission in mind: Kill every man, woman and child they came across. \"I can't forget what happened that day,\" Muhammad says with a severe stare.", "ro": "Cel puțin 140 de persoane, majoritatea copii, au fost ucise când bărbați înarmați au atacat școala publică a armatei din Peshawar în luna decembrie a anului trecut. Mohamed, care a sosit la Londra luna trecută pentru operație, va fi externat mai târziu din spital. Exact cu nouă luni în urmă, într-o dimineață obișnuită de marți, Mohamed stătea la ora de primul ajutor și își asculta atent profesorii. Chiar atunci, șapte bărbați înarmați deghizați în uniformele agenților de pază intrau în școala publică a armatei. 
Purtau centuri cu explozivi și aveau de îndeplinit o misiune simplă: să îi ucidă pe toți bărbații, femeile și copiii care le ieșeau în cale. „Nu pot uita ce s-a întâmplat în acea zi”, spune Mohamed cu o privire aspră." } } +{ "translation": { "en": "We were sitting in the auditorium, we were asking questions... and then we heard heavy gunfire outside. The terrorists moved inside and they started killing - our teacher was burned alive. Muhammad described pulling four other pupils out of the auditorium as the carnage unfolded. He said he then heard his friend, Hamza calling to him. He said, 'oh brother save me'. I held his hand. That's when I was shot in the back, and he was shot in the head. Most of the people killed in the attack were pupils Hamza died in Muhammad's arms. Muhammad recalled blacking out after that, and the next thing he knew he was in a hospital bed, paralysed from the waist down.", "ro": "Stăteam în amfiteatru, puneam întrebări... apoi am auzit focuri de armă afară. Teroriștii au intrat înăuntru și au început să ucidă. Profesorul nostru a fost ars de viu. Mohamed descrie cum a scos patru elevi din amfiteatru în timp ce se desfășura carnagiul. Apoi spune că și-a auzit prietenul, pe Hamza, strigându-l. Spunea „oh, frate, salvează-mă”. L-am ținut de mână. Atunci eu am fost împușcat în spate, iar el în cap. Cei mai mulți dintre cei uciși în atac erau elevi Hamza a murit în brațele lui Mohamed. Mohamed își amintește că imediat după asta a leșinat și că următorul lucru pe care l-a știut a fost că se afla pe un pat de spital, paralizat de la brâu în jos." } } +{ "translation": { "en": "Doctors in Peshawar in northern Pakistan, and then Rawalpindi, close to the capital, told his family there was no treatment, and he would never walk again. \"Seeing him I felt like my soul had left my body,\" says Muhammad's father, Sher Khan Those nine months were the hardest in my life. But Mr Khan and his wife, Sherbano, refused to believe that their cricket-mad son would never be able to use his legs again. They campaigned, and appealed for help on Pakistani TV, gaining the support of high profile people such as cricketer turned politician Imran Khan.", "ro": "Doctorii din Peshawar din nordul Pakistanului, apoi cei din Rawalpindi, aproape de capitală, i-au spus familiei sale că nu exista tratament și că nu va mai putea merge niciodată. „Când l-am văzut, am simțit cum îmi iese sufletul”, spune Sher Khan, tatăl lui Mohamed. Acele nouă luni au fost cele mai grele din viața mea. Însă Khan și soția lui, Sherbano, au refuzat să creadă că fiul lor atât de pasionat de crichet nu-și va mai putea folosi vreodată picioarele. Au făcut o campanie și au cerut ajutor de la televiziunea pakistaneză, atrăgând sprijinul unor oameni faimoși precum Imran Khan, jucător de crichet devenit politician." } } +{ "translation": { "en": "Finally, they were able to raise the funds to bring Muhammad to the UK and provide him with treatment at London's private Harley Street Clinic. Consultant neurosurgeon Irfan Malik described Muhammad as \"terrified\" when he first arrived at the hospital. \"He'd spent the last [few] months lying on a bed, unable to move side to side,\" says Mr Malik. He was weak, he had a pressure sore on his back. He wasn't in great shape. 
A vertebra at the base of Muhammad's spine was destroyed Muhammad was shot in his shoulder, his hip, and his back during the attack, damaging his lower spine - leading to paralysis.", "ro": "Într-un final, au reușit să strângă fonduri pentru a-l duce pe Mohamed în Regatul Unit și a-i oferi tratament la clinica privată Harley Street din Londra. Neurochirurgul consultant Irfan Malik l-a descris pe Mohamed drept „înspăimântat” când acesta a ajuns la spital. „Își petrecuse ultimele [câteva] luni zăcând în pat, fără să se poată mișca de pe o parte pe alta, spune Malik. Era slăbit, se pusese multă presiune pe spatele lui. Nu era într-o formă prea bună. O vertebră de la baza coloanei vertebrale a lui Mohamed fusese distrusă Mohamed fusese împușcat în umăr, în șold și în spate în timpul atacului, iar coloana vertebrală inferioară îi fusese distrusă, ducând la paralizie." } } +{ "translation": { "en": "But during six hours of surgery, Mr Malik and his team were able to reattach nerve endings and reconstruct the damaged part of the spine. Even Mr Malik was surprised at what happened next. Exactly one week after the surgery Muhammad stood up and started taking steps and walking. We were not expecting to get that sort of excellent result. That was miraculous,\" he says. Less than two weeks after his operation, Muhammad is ready to leave hospital and start the long road to recovery. Muhammad has defied the odds and started to walk again He says he wants to build his strength and continue his education in the UK. But he says he is determined to return to Pakistan, join the army and help fight terrorism.", "ro": "Însă, în timpul unei operații care a durat șase ore, Malik și echipa lui au reușit să lege din nou terminațiile nervoase și să reconstruiască partea distrusă a coloanei. Chiar și Malik a fost surprins de ceea ce s-a întâmplat în continuare. Exact la o săptămână după operație, Mohamed s-a ridicat și a început să facă pași și să meargă. Nu ne așteptam la un rezultat atât de bun. A fost un miracol”, spune acesta. În mai puțin de două săptămâni de la operație, Mohamed este gata să părăsească spitalul și să înceapă procesul lung de recuperare. Mohamed a sfidat soarta și a început să meargă din nou Vrea să devină puternic și să își continue studiile în Regatul Unit. Însă este hotărât să revină în Pakistan, să se înroleze în armată și să lupte împotriva terorismului." } } +{ "translation": { "en": "\"I feel like I have a second chance at life,\" he says as he shows off pictures he's drawn of guns scribbled out next to school books and pens Muhammad grows physically stronger every day but the psychological trauma he continues to endure is unimaginable. \"My anger is not diminishing\" he says. In my school little kids were killed. What was their crime? His mother, wiping a tear from her eye, caressed his head and said: \"I can see my son walking again.\" He'll be able to get on with his normal life. 'Super Voice' 4G service from Three offers better signal Three is making use of a lower frequency 4G spectrum that can travel more widely", "ro": "„Simt că am încă o șansă la viață” spune el, arătând imaginile cu arme desenate de el lângă manuale școlare și stilouri Fizic, Mohamed devine tot mai puternic în fiecare zi, însă trauma psihologică prin care trece și acum este de neimaginat. „Furia mea nu a scăzut”, mărturisește el. În școala mea au fost uciși copii mici. Ce crimă au comis ei? Mama lui își șterge o lacrimă, îl mângâie pe creștet și spune: „Îmi văd fiul mergând din nou”. 
Va putea să-și continue firesc viața. Serviciul 4G „Super Voice” de la Three oferă semnal mai bun Three folosește un spectru 4G cu o frecvență mai joasă, care poate acoperi o zonă mai extinsă" } } +{ "translation": { "en": "Mobile phone provider Three has launched a UK service it says will improve reception inside buildings and in rural black spots. Its 4G Super Voice enables customers to make calls and send texts using a lower frequency spectrum. Other networks are looking into introducing the technology, known as Voice Over Long-Term Evolution (VoLTE). It currently works on only the Samsung Galaxy S5, but recent iPhone handsets will be added in the coming months. Three said up to 5.5 million customers would have access to the service by 2017.", "ro": "Furnizorul de telefonie mobilă Three a lansat în Regatul Unit un serviciu despre care spune că va îmbunătăți recepția în interiorul clădirilor și în zonele rurale fără semnal. Serviciul 4G Super Voice le permite clienților să efectueze apeluri și să trimită mesaje text folosind un spectru cu o frecvență mai joasă. Și alte rețele intenționează să introducă aceeași tehnologie, cunoscută ca „Voice Over Long-Term Evolution (VoLTE)”. Aceasta funcționează momentan doar cu Samsung Galaxy S5, însă telefoanele iPhone recente vor beneficia de ea în lunile următoare. Three menționează că până la 5,5 milioane de clienți vor avea acces la serviciu până în 2017." } } +{ "translation": { "en": "Chief technology officer Bryn Jones said: \"By the end of the year, one million of our customers will have access to better indoor coverage and be able to use their phones in more places than ever before.\" Stars prepare for panto season Pantomime season is big business for theatres up and down the UK, with many getting ready for this year's season now. Some of the biggest names in showbusiness now take part in the yuletide theatre. Matthew Kelly and Hayley Mills will be appearing in Cinderella - one as an ugly sister, the other as fairy godmother. They reveal their panto secrets to BBC Breakfast. Steven Wilson: 'If I don't do anything, I feel this creeping guilt'", "ro": "Responsabilul șef pentru tehnologie, Bryn Jones a declarat: „Până la sfârșitul anului, un milion dintre clienții noștri vor avea acces la o acoperire mai bună în interior și își vor putea folosi telefoanele în mai multe locuri ca până acum”. Vedetele se pregătesc pentru stagiunea de pantomimă Stagiunea de pantomimă este foarte importantă pentru teatrele din tot Regatul Unit, multe dintre ele pregătindu-se acum pentru stagiunea din acest an. Acum, la teatrul de Crăciun participă unele dintre numele cele mai mari din showbusiness. Matthew Kelly și Hayley Mills vor apărea în Cenușăreasa - primul în rolul uneia dintre surorile rele, iar a doua în rolul zânei. Aceștia dezvăluie secretele pantomimei lor la BBC Breakfast. Steven Wilson: „Dacă nu fac nimic, mă simt vinovat”" } } +{ "translation": { "en": "Steven Wilson was recently the big winner at the Progressive Music Awards Steven Wilson is often dubbed the hardest working musician in the world of progressive rock. The multi-talented musician won three prizes at this month's Progressive Music Awards in London, including album of the year for Hand. The Guardian's five-star review called it \"a smart, soulful and immersive work of art.\" Since the 1980s, Wilson has been the driving force in a number of musical projects, the best known of which is the rock band Porcupine Tree. 
Now, ahead of two sell-out shows at the Royal Albert Hall, Wilson is releasing a vinyl-only double LP, Transience, to showcase the \"more accessible\" side of his solo output.", "ro": "Steven Wilson a fost desemnat recent drept marele câștigător al Progressive Music Awards Steven Wilson a fost numit de multe ori drept cel mai muncitor muzician din lumea rockului progresiv. Talentatul muzician a câștigat trei premii la Progressive Music Awards, care a avut loc luna aceasta la Londra, printre care și premiul pentru cel mai bun album al anului pentru Hand. În recenzia sa de cinci stele, The Guardian a numit albumul „o operă de artă inteligentă, expresivă și captivantă”. Încă din anii 1980, Wilson este motorul mai multor proiecte muzicale, cel mai cunoscut dintre acestea fiind trupa de rock Porcupine Tree. Acum, înainte de două spectacole cu casa închisă la Royal Albert Hall, Wilson lansează un dublu LP doar în format vinil, Transience, pentru a arăta latura „mai accesibilă” a activității sale solo." } } +{ "translation": { "en": "He tells the BBC about his love of vinyl, his busy schedule and explains how comic actor Matt Berry came to be his support act. What does vinyl mean to you? I grew up at the very tail end of the vinyl era, and at the time, I remember, we couldn't wait for CD to come along because vinyl was so frustrating. You would buy the record, take it home, and it would have a scratch, and you would have to take it back again. I love CDs, and for some kinds of music - classical for example - it is better than vinyl. But the problem with the CD and digital downloads is that there's nothing you can really cherish or treasure. Owning vinyl is like having a beautiful painting hanging in your living room.", "ro": "A povestit pentru BBC despre dragostea lui pentru viniluri și despre programul său încărcat și a explicat cum a ajuns actorul de comedie Matt Berry să îi deschidă spectacolele. Ce înseamnă vinil pentru tine? Am crescut chiar în perioada de sfârșit a erei vinilurilor și îmi amintesc că atunci abia așteptam apariția CD-ului, căci vinilul era atât de enervant. Cumpărai un disc, mergeai cu el acasă, avea o zgârietură și trebuia să îl aduci înapoi. Iubesc CD-urile, iar pentru anumite tipuri de muzică, de exemplu cea clasică, sunt mai bune decât vinilurile. Însă problema cu CD-urile și cu descărcările digitale este aceea că nu mai există nimic pe care să îl prețuiești cu adevărat. Să ai un vinil e ca și cum ai avea un tablou frumos agățat în sufragerie." } } +{ "translation": { "en": "It's something you can hold, pore over the lyrics and immerse yourself in the art work. I thought it was just a nostalgic thing, but it can't be if kids too young to remember vinyl are enjoying that kind of experience. Do you have a piece of vinyl that you treasure? The truth is I got rid of 100% of my vinyl in the 90s. All the vinyl I have is re-bought. I started off from the perspective that I wanted to recreate the collection I had when I was 15, but it's gone beyond that. The first record which I persuaded my parents to buy for me was Electric Light Orchestra's Out of the Blue.", "ro": "E ceva ce poți ține în mână, în timp ce te lași absorbit de versuri și copleșit de actul artistic. Am crezut că e doar o chestie nostalgică, însă nu are cum să fie așa dacă unor puști prea tineri să-și amintească de viniluri le place acest gen de experiență. Ai vreun vinil la care ții în mod special? Recunosc că am scăpat de toate vinilurile în anii '90. Toate vinilurile pe care le am sunt cumpărate din nou. 
Am pornit de la ideea de a reface colecția pe care o aveam la 15 ani, însă am trecut de limita aceea. Primul disc pe care mi-am convins părinții să mi-l cumpere a fost Out of the Blue de la Electric Light Orchestra." } } +{ "translation": { "en": "If I still had my original copy, it would have sentimental value, but, alas, it's in a charity shop somewhere. Steven Wilson hopes the album will be a doorway for potential new fans Why release your new compilation Transience on vinyl? It was originally conceived as an idea for Record Store Day, but we missed the boat on that. My record company had suggested I put together some of my shorter, more accessible songs. I got a bit obsessed by the idea to make something like \"an introduction to Steven Wilson,\" and I was committed to it being a vinyl-only release. Anyone who buys the vinyl does also get a high-resolution download.", "ro": "Dacă aș mai fi avut încă exemplarul inițial, acesta ar fi avut valoare sentimentală, însă, din păcate, se află pe undeva printr-un magazin de caritate. Steven Wilson speră că albumul va fi o poartă către posibili fani noi De ce ți-ai lansat noua compilație Transience pe vinil? Aceasta a fost concepută inițial ca idee pentru Ziua magazinelor de discuri, însă am ratat ocazia. Casa mea de discuri sugerase să adun câteva dintre melodiile mele mai scurte și mai accesibile. Am ajuns să fiu ușor obsedat de ideea de a face ceva gen „introducere în muzica lui Steven Wilson” și am ținut neapărat ca proiectul să fie lansat doar pe vinil. Cine cumpără vinilul primește, de asemenea, și o variantă descărcată la rezoluție înaltă." } } +{ "translation": { "en": "Do you have a concern that the album won't show your work in a true light?", "ro": "Ești îngrijorat că albumul nu va arăta muzica ta în adevărata ei lumină?" 
} } diff --git a/fixtures/tests_samples/xsum/sample.json b/fixtures/tests_samples/xsum/sample.json new file mode 100644 index 0000000000000000000000000000000000000000..ea6e8a8bb8f6705b20776a4e126b8822d6889f7e --- /dev/null +++ b/fixtures/tests_samples/xsum/sample.json @@ -0,0 +1,10 @@ +{"document": "The warning begins at 22:00 GMT on Saturday and ends at 10:00 on Sunday.\nThe ice could lead to difficult driving conditions on untreated roads and slippery conditions on pavements, the weather service warned.\nOnly the southernmost counties and parts of the most westerly counties are expected to escape.\nCounties expected to be affected are Carmarthenshire, Powys, Ceredigion, Pembrokeshire, Denbighshire, Gwynedd, Wrexham, Conwy, Flintshire, Anglesey, Monmouthshire, Blaenau Gwent, Caerphilly, Merthyr Tydfil, Neath Port Talbot, Rhondda Cynon Taff and Torfaen.", "summary": "The Met Office has issued a yellow weather warning for ice across most of Wales."} +{"document": "You can see highlights of Sunderland v Arsenal on Match of the Day at 22:20 BST on Saturday on BBC One and the BBC Sport website.\nStoke and West Ham, for example, have started to climb away from the relegation zone but the biggest worry for Sunderland fans is that their side do not look remotely capable of doing the same.\nI know the Black Cats have got out of trouble before having found themselves in a similar situation but this time, after picking up only two points from their first nine games, things look really desperate for the only top-flight team without a win.\nAt least one element of their struggles seems to be self-inflicted, with everyone at the club feeling sorry for themselves - and not just because they have lost some players to injury and conceded some costly late goals.\nThere is a negative feeling about the place with the manager David Moyes and his players talking about how they have gone backwards since last season, when they should be searching for any kind of spark that could change things around.\nFrom the outside, looking at the way they play and their lack of creativity, it is hard to see what that spark might be or what could fundamentally change under Moyes until the January transfer window opens.\nIf they can get one win under their belt then they will get a bit of belief back but, the longer this winless run goes on, the more negativity there will be.\nMedia playback is not supported on this device\nSunderland finished last season on a high under Sam Allardyce, with a run of just one defeat in their last 11 games securing their safety.\nIn the space of five months, all of that confidence and momentum seems to have been sucked out of the club, despite them effectively having the same group of players who, not so long ago, looked inspired.\nThat is not all down to Moyes, but he has to take some responsibility for it.\nI am yet to see a defined style of play from Sunderland since he took charge at the end of July.\nThat is in contrast to Allardyce's time as manager, when they were resolute and difficult to beat and, at the end of his stint at the Stadium of Light, also played with a purpose when they went forward.\nOff the pitch, Moyes has not helped himself much either.\nThere was no need for him to be so pessimistic when he came out after the second game of the season and announced they would be in a relegation fight, which did not send out the right message to his players or the fans.\nWhen he took charge, he had actually started out by being unrealistically positive - talking about Sunderland becoming 
a club that regularly finished in the top half of the Premier League - but his expectations went downhill very quickly.\nI know you can argue that he has been proved right, because Sunderland are now battling the drop, but it meant there was a cloud over them almost as soon as the season had started.\nIt seems to be a case that if you stop Jermain Defoe, you stop Sunderland. His statistics stand up well in comparison to last season, but the rest of their team are not doing enough in attack.\nThey were reliant on Defoe last season too, but others did chip in - in their first nine league games of 2015-16, five players found the net. This time around, only Defoe and Patrick van Aanholt have scored in the same period.\nIt is going to be a massive struggle for them to stay up from the position they are now in anyway, but they badly need a win and quickly. I don't see it coming at home to Arsenal on Saturday, though.\nDo they even look capable of holding out for a draw against the Gunners, the way another struggling team Middlesbrough did at Emirates Stadium last weekend? No.\nIf you struggle to make chances and score goals, as Sunderland do, that puts more pressure on your defence because you know if you concede then you are in big trouble.\nAnd the Black Cats have problems at the back as well - their only clean sheet in 12 matches under Moyes was against League One side Shrewsbury Town in the EFL Cup.\nIt does not bode well against an Arsenal side that are averaging more than two goals a game this season.\nIt is hard to find any positives from Sunderland's situation but at least they have not been cut adrift at the bottom - yet.\nUnless they win soon, that could happen. I think Hull are also in for a very tough season but when I look at the other two teams immediately above them, Boro and Swansea, they definitely have more about them than the Black Cats do.\nMedia playback is not supported on this device\nChanging manager has clearly not helped Sunderland and comparisons with his predecessor do not help Moyes much either.\nYou cannot tell me that, if Allardyce was still in charge, Sunderland would have only picked up two points so far. It just would not have happened.\nMoyes replaced him relatively late in the summer, which is difficult in itself, but he can only complain about the things that have gone against him up to a point. He should be doing much better than he is.\nHe is still the manager and he is capable of turning things around, so it is right there is no suggestion of him getting the sack.\nBut that will not last forever. This industry is results-driven and Moyes' results are not good enough.\nThat clearly has to change soon and, looking at Sunderland's next few fixtures, the one that stands out as a must-win is their home game against Hull on 19 November.\nIf they fail to beat Arsenal and Bournemouth, then the visit of the Tigers will be the game to define Moyes' tenure.
If Sunderland are still without a win after that, things will become extremely difficult for him.\nChris Sutton was speaking to BBC Sport's Chris Bevan.", "summary": "We are exactly a quarter of the way through the Premier League season and some teams at the bottom of the table seem to be turning things around after making a bad start."} +{"document": "The win keeps the Candystripes two points behind leaders Dundalk who won 2-0 away to Shamrock Rovers.\nFormer Plymouth striker Patterson scored his sixth goal of the season in the 14th minute at the Brandywell.\nHe shot into an empty net after the ball broke to him when keeper Dean Delany thwarted Barry McNamee.\nKurtis Byrne should have netted a speedy equaliser but the son of former Celtic player Paul Byrne completely missed his kick in front of goal.\nThat was the one big scare for Kenny Shiels' men on a night when both keepers had a quiet night.\nDerry City have won six and drawn two in the eight games they have played since losing to Finn Harps on the first day of the season.", "summary": "Rory Patterson's early goal proved enough to give second-placed Derry City a home victory over Bohemians in Friday night's Premier Division clash."} +{"document": "The centre-right coalition led by Mr Passos Coelho won the most seats in the election on 4 October.\nBut Socialist leader Antonio Costa has been working to build a coalition with far-left parties.\nMany believe that Mr Passos Coelho will fail to pass the test of a vote of no confidence in Portugal's parliament.\nPresident Anibal Cavaco Silva would then be expected to ask the left to form a government.\nThere are fears that weeks of uncertainty could harm Portugal's economic recovery, more than a year after it exited the strict terms of its €78bn (£57bn) international bailout.\nEU officials have threatened to take action against Portugal for missing a 15 October deadline to present its draft 2016 budget.\nPortugal is still running one of the highest budget deficits in the eurozone.\n12%\nof the workforce is unemployed\n20%\nof people live below the poverty line\n485,000 emigrated from Portugal between 2011 and 2014\n125% debt to GDP - the second highest rate in the European Union\nMr Passos Coelho's Social Democrats have promised to present a budget, but the two left-wing parties campaigned strongly against his outgoing government's record of harsh austerity.\nThe Left Bloc is seen as allied to the anti-austerity Syriza party in Greece, which for months tried to renegotiate the terms of Greece's eurozone bailout.\nPortugal's Communist Party is regarded as anti-euro and anti-Nato, although it is thought to have moderated its eurozone policies in recent weeks.\nIf Mr Costa's Socialists are eventually chosen to lead a left-wing coalition, it would be the first time since the fall of Portugal's dictatorship in 1974 that a right-wing president appointed a government backed by communists.\nAfter his re-appointment as prime minister leading a right-of-centre coalition, Pedro Passos Coelho has 10 days to appoint ministers and secure parliamentary approval.\nThat may prove impossible, since his coalition lost its majority in the 4 October election and the Socialists have pledged to reject his programme if their talks with other parties succeed.\nTogether, the Socialists, Left Bloc and Communist Party have a majority. 
All wanted the president to appoint Mr Costa - arguing that anything else was a waste of time.\nIf Mr Passos Coelho does fail, the president could then appoint Mr Costa or keep the incumbent on as caretaker.\nFresh legislative elections may only take place from June, after voters have elected a new president early next year.", "summary": "The Portuguese president has invited incumbent Prime Minister Pedro Passos Coelho to form the next government, despite him having lost his majority."} +{"document": "Nev Edwards scored an early try for Sale, before Castres' Florian Vialelle went over, but Julien Dumora's penalty put the hosts 10-7 ahead at the break.\nJoe Ford sent over a penalty before Castres' Marc-Antoine Rallier and Sale's Will Addison were sin-binned.\nJulien Caminati's late attempt to stop Charlie Ingall saw Sale awarded the decisive penalty try.\nThe win moves the English Premiership side to within one point of Pool Two leaders Newport Gwent Dragons after three games.\nSale got off to the ideal start, Edwards sprinting away for the game's opening points from an Andrei Ostrikov kick, but Castres heaped the pressure on in search of a reply, which came through Vialelle on eight minutes.\nSharks flanker Magnus Lund was forced off with a head injury before the television match official denied Castres a second try, with replays showing that the Sharks defence did enough to force full-back Caminati into touch.\nFord had a chance to put Sale ahead again, but his penalty on 27 minutes drifted wide. Dumora, however, made no mistake soon after, slotting over to give the French side the lead on 33 minutes.\nA combination of probing grubber kicks and scrappy play eventually led to Ford teeing up his second penalty attempt, with the fly-half this time booting the three points to make it 10-10.\nRallier's yellow card following a scuffle saw Ford opt for the posts soon after, but he was off target again before Sale's one-man advantage was lost as Addison was sin-binned.\nSharks pushed for the breakthrough as Ingall went close to touching down, and the video referee eventually gave the penalty try after deciding that Caminati's attempt to stop the winger was illegal.\nCastres: Caminati; Martial, Vialelle, Combezou, Decrop; Dumora, Dupont; Taumoepeau, Rallier, Montes; Samson, Moreaux, Caballero, Diarra, Beattie.\nReplacements: Beziat, Tichit, Martinez, Desroche, Babillot, Fontaine, Lamerat, Seron.\nSale: Arscott; Edwards, Addison, Jennings, Ingall; Ford, Mitchell, Lewis-Roberts, Briggs, Mujati, Mills, Ostrikov, Lund, Seymour (capt), Easter.\nReplacements: Taylor, Flynn, Parker, Beaumont, Neild, Jeffers, James, Haley.\nReferee: David Wilkinson (Ireland)", "summary": "A late penalty try gave Sale victory over Castres at Stade Pierre-Antoine in their European Challenge Cup clash."} +{"document": "The 33-year-old was released by Norwich this summer after five years at the club, during which time he made 75 Canaries first-team appearances.\nTurner also had spells on loan at Fulham and Sheffield Wednesday during his time at Carrow Road.\nIn total, the centre-back has made 436 senior career appearances for eight different clubs.\nFind all the latest football transfers on our dedicated page.", "summary": "League One side Southend United have signed former Hull and Norwich defender Michael Turner on a one-year deal."} +{"document": "United contacted St Johnstone this week with a view to speaking to 52-year-old Wright about the job but this approach was rejected by the Saints board.\nThe Tannadice club -
bottom of the Premiership - are seeking to replace Jackie McNamara, who left last month.\nDave Bowman took the first team for Saturday's loss to Partick Thistle.\nThe Tangerines have won only once this season and prop up the table with five points from 10 games.\nFormer Northern Ireland goalkeeper Wright, who replaced Steve Lomas at McDiarmid Park in 2013, led St Johnstone to Scottish Cup success in his first season in charge.\nHe has also secured two successive top-six finishes for the Perth side and previously managed in his homeland.", "summary": "St Johnstone boss Tommy Wright is no longer under consideration for the Dundee United manager's job, BBC Scotland has learned."} +{"document": "Media playback is unsupported on your device\n2 November 2014 Last updated at 17:20 GMT\nHomes and businesses were damaged in the storm, but weather experts were not able to confirm it was a tornado.\nNavtej Johal reports.", "summary": "Residents in Coalville in Leicestershire are cleaning up after high winds hit the town."} +{"document": "5 August 2015 Last updated at 06:36 BST\nShe's now 84 and has been telling Newsround the inspiring story of her life before and after that devastating and world-changing event.\nThis animation contains some sad moments that you might find upsetting.\nYou can find out more about what happened in Hiroshima here.\nWatch 'Hiroshima: A Newsround Special' - Thursday 6 August at 5.30pm on the CBBC channel and on the Newsround website.", "summary": "Bun Hashizume was 14 years old and lived in Hiroshima, in Japan, when a nuclear bomb was dropped on the city 70 years ago, at the end of World War Two."} +{"document": "But what has been your moment of the year?\nFrom Ben Stokes' 258 off 198 balls against South Africa to Stuart Broad's 6-17 against the same opponents, and Alastair Cook being the first Englishman to reach 10,000 Test runs, there are lots of highlights.\nOr perhaps you revelled in Australia being skittled for just 85? 
Or the dog that invaded the pitch at Vizag?\nThe cricket brains of BBC Sport and BBC Radio 5 live asked you to rank your top 10, and your shortlist will be revealed on Tuesday's Tuffers and Vaughan Cricket Show (20:30 GMT, BBC Radio 5 live and online).\nVotes will no longer count but you can still pick your top 10 and share with friends.\nWhat are your top 10 cricketing moments from this year?", "summary": "It's been topsy-turvy for the England side but eventful and entertaining nonetheless."} diff --git a/pytorch_model.bin b/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..38f6645c9a893663813fb621f0ea034c5c54e760 Binary files /dev/null and b/pytorch_model.bin differ diff --git a/runs/Feb15_12-40-57_Beaver/1613410865.1810203/events.out.tfevents.1613410865.Beaver.416948.1 b/runs/Feb15_12-40-57_Beaver/1613410865.1810203/events.out.tfevents.1613410865.Beaver.416948.1 new file mode 100644 index 0000000000000000000000000000000000000000..eaad24c460f3586d52c5b9176e4a013f46a57dd5 Binary files /dev/null and b/runs/Feb15_12-40-57_Beaver/1613410865.1810203/events.out.tfevents.1613410865.Beaver.416948.1 differ diff --git a/runs/Feb15_12-40-57_Beaver/events.out.tfevents.1613410865.Beaver.416948.0 b/runs/Feb15_12-40-57_Beaver/events.out.tfevents.1613410865.Beaver.416948.0 new file mode 100644 index 0000000000000000000000000000000000000000..496bb0b2a5df10f7ca77f03dc851014f793ef0a3 Binary files /dev/null and b/runs/Feb15_12-40-57_Beaver/events.out.tfevents.1613410865.Beaver.416948.0 differ diff --git a/runs/Feb15_12-41-05_Beaver/1613410865.2578263/events.out.tfevents.1613410865.Beaver.416948.3 b/runs/Feb15_12-41-05_Beaver/1613410865.2578263/events.out.tfevents.1613410865.Beaver.416948.3 new file mode 100644 index 0000000000000000000000000000000000000000..fb731adbcb05b81ae50be10f7f55ea1c4145e600 Binary files /dev/null and b/runs/Feb15_12-41-05_Beaver/1613410865.2578263/events.out.tfevents.1613410865.Beaver.416948.3 differ diff --git a/runs/Feb15_12-41-05_Beaver/1613410865.3054066/events.out.tfevents.1613410865.Beaver.416948.5 b/runs/Feb15_12-41-05_Beaver/1613410865.3054066/events.out.tfevents.1613410865.Beaver.416948.5 new file mode 100644 index 0000000000000000000000000000000000000000..3fa4eb5fe36342255a3d16b87eb35448fd4952af Binary files /dev/null and b/runs/Feb15_12-41-05_Beaver/1613410865.3054066/events.out.tfevents.1613410865.Beaver.416948.5 differ diff --git a/runs/Feb15_12-41-05_Beaver/1613410865.3519046/events.out.tfevents.1613410865.Beaver.416948.7 b/runs/Feb15_12-41-05_Beaver/1613410865.3519046/events.out.tfevents.1613410865.Beaver.416948.7 new file mode 100644 index 0000000000000000000000000000000000000000..03bdff5bc642e5c3ccaa686e3c4947b1e41a1c92 Binary files /dev/null and b/runs/Feb15_12-41-05_Beaver/1613410865.3519046/events.out.tfevents.1613410865.Beaver.416948.7 differ diff --git a/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.2 b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.2 new file mode 100644 index 0000000000000000000000000000000000000000..498ab727d10bb5391082277c17920f43c6de0462 Binary files /dev/null and b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.2 differ diff --git a/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.4 b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.4 new file mode 100644 index 0000000000000000000000000000000000000000..ff3d4b4b554339fcde249df99ddae1347b2e5c16 Binary files /dev/null and 
b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.4 differ diff --git a/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.6 b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.6 new file mode 100644 index 0000000000000000000000000000000000000000..7a5bf95548e47521c91ba9c5ad45b0a387895137 Binary files /dev/null and b/runs/Feb15_12-41-05_Beaver/events.out.tfevents.1613410865.Beaver.416948.6 differ diff --git a/runs/Feb15_12-43-30_Beaver/1613411012.460237/events.out.tfevents.1613411012.Beaver.418028.1 b/runs/Feb15_12-43-30_Beaver/1613411012.460237/events.out.tfevents.1613411012.Beaver.418028.1 new file mode 100644 index 0000000000000000000000000000000000000000..86e91deba4f75edeb167e647b519bcaea548edd6 Binary files /dev/null and b/runs/Feb15_12-43-30_Beaver/1613411012.460237/events.out.tfevents.1613411012.Beaver.418028.1 differ diff --git a/runs/Feb15_12-43-30_Beaver/events.out.tfevents.1613411012.Beaver.418028.0 b/runs/Feb15_12-43-30_Beaver/events.out.tfevents.1613411012.Beaver.418028.0 new file mode 100644 index 0000000000000000000000000000000000000000..7f7d8c7892f0ddbaf3b885f0b70405a8362d7ff7 Binary files /dev/null and b/runs/Feb15_12-43-30_Beaver/events.out.tfevents.1613411012.Beaver.418028.0 differ diff --git a/runs/Feb15_12-43-32_Beaver/1613411012.5048187/events.out.tfevents.1613411012.Beaver.418028.3 b/runs/Feb15_12-43-32_Beaver/1613411012.5048187/events.out.tfevents.1613411012.Beaver.418028.3 new file mode 100644 index 0000000000000000000000000000000000000000..2ae28347b3461252da7bfcecf608367071c39b42 Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/1613411012.5048187/events.out.tfevents.1613411012.Beaver.418028.3 differ diff --git a/runs/Feb15_12-43-32_Beaver/1613411012.5604634/events.out.tfevents.1613411012.Beaver.418028.5 b/runs/Feb15_12-43-32_Beaver/1613411012.5604634/events.out.tfevents.1613411012.Beaver.418028.5 new file mode 100644 index 0000000000000000000000000000000000000000..d5056a24f03ca89c1bfc69a94bf1d1166b87071d Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/1613411012.5604634/events.out.tfevents.1613411012.Beaver.418028.5 differ diff --git a/runs/Feb15_12-43-32_Beaver/1613411012.6075606/events.out.tfevents.1613411012.Beaver.418028.7 b/runs/Feb15_12-43-32_Beaver/1613411012.6075606/events.out.tfevents.1613411012.Beaver.418028.7 new file mode 100644 index 0000000000000000000000000000000000000000..c77b07c6bbac67bce8d652fdd948903fa3d5a9e5 Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/1613411012.6075606/events.out.tfevents.1613411012.Beaver.418028.7 differ diff --git a/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.2 b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.2 new file mode 100644 index 0000000000000000000000000000000000000000..573a771b0a78a8705323f33d7e910d52e22cd383 Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.2 differ diff --git a/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.4 b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.4 new file mode 100644 index 0000000000000000000000000000000000000000..8aab2e14b3cca61271ec5e6719330b1309098bec Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.4 differ diff --git a/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.6 b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.6 new file mode 
100644 index 0000000000000000000000000000000000000000..3dd138c21a83f4b2b1f66621de38c694afd4b083 Binary files /dev/null and b/runs/Feb15_12-43-32_Beaver/events.out.tfevents.1613411012.Beaver.418028.6 differ diff --git a/runs/Feb15_12-43-55_Beaver/1613411037.134224/events.out.tfevents.1613411037.Beaver.418259.1 b/runs/Feb15_12-43-55_Beaver/1613411037.134224/events.out.tfevents.1613411037.Beaver.418259.1 new file mode 100644 index 0000000000000000000000000000000000000000..019c7abf96e29bc48fc190295ba6f7e0df2bdc2a Binary files /dev/null and b/runs/Feb15_12-43-55_Beaver/1613411037.134224/events.out.tfevents.1613411037.Beaver.418259.1 differ diff --git a/runs/Feb15_12-43-55_Beaver/events.out.tfevents.1613411037.Beaver.418259.0 b/runs/Feb15_12-43-55_Beaver/events.out.tfevents.1613411037.Beaver.418259.0 new file mode 100644 index 0000000000000000000000000000000000000000..9db19fae1d427f7afe56d2ed2081ce5e632fc93f Binary files /dev/null and b/runs/Feb15_12-43-55_Beaver/events.out.tfevents.1613411037.Beaver.418259.0 differ diff --git a/runs/Feb15_12-43-57_Beaver/1613411037.1697214/events.out.tfevents.1613411037.Beaver.418259.3 b/runs/Feb15_12-43-57_Beaver/1613411037.1697214/events.out.tfevents.1613411037.Beaver.418259.3 new file mode 100644 index 0000000000000000000000000000000000000000..fd503757f1aa5567f3e3c6984a069c332b74c606 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/1613411037.1697214/events.out.tfevents.1613411037.Beaver.418259.3 differ diff --git a/runs/Feb15_12-43-57_Beaver/1613411037.2089725/events.out.tfevents.1613411037.Beaver.418259.5 b/runs/Feb15_12-43-57_Beaver/1613411037.2089725/events.out.tfevents.1613411037.Beaver.418259.5 new file mode 100644 index 0000000000000000000000000000000000000000..8b2bbfc136bc36e5637358296cb1d79dc38316a3 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/1613411037.2089725/events.out.tfevents.1613411037.Beaver.418259.5 differ diff --git a/runs/Feb15_12-43-57_Beaver/1613411037.2617972/events.out.tfevents.1613411037.Beaver.418259.7 b/runs/Feb15_12-43-57_Beaver/1613411037.2617972/events.out.tfevents.1613411037.Beaver.418259.7 new file mode 100644 index 0000000000000000000000000000000000000000..fb15583065d30757438cd24d7c73433215732d06 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/1613411037.2617972/events.out.tfevents.1613411037.Beaver.418259.7 differ diff --git a/runs/Feb15_12-43-57_Beaver/1613411037.474745/events.out.tfevents.1613411037.Beaver.418259.9 b/runs/Feb15_12-43-57_Beaver/1613411037.474745/events.out.tfevents.1613411037.Beaver.418259.9 new file mode 100644 index 0000000000000000000000000000000000000000..3d78976e3ef2f687d3cf304bf8c2a92550cc88a8 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/1613411037.474745/events.out.tfevents.1613411037.Beaver.418259.9 differ diff --git a/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.2 b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.2 new file mode 100644 index 0000000000000000000000000000000000000000..e8b749dca7d1fe842418bfa642b0d948eb930b92 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.2 differ diff --git a/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.4 b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.4 new file mode 100644 index 0000000000000000000000000000000000000000..e88e483ca72b83d0892f626fb16d518beb79dc44 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.4 differ diff 
--git a/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.6 b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.6 new file mode 100644 index 0000000000000000000000000000000000000000..b762aef99975b09d2897498c2945c578f8b87865 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.6 differ diff --git a/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.8 b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.8 new file mode 100644 index 0000000000000000000000000000000000000000..adf34c58cb918e0a980a983f46883fa5507cac82 Binary files /dev/null and b/runs/Feb15_12-43-57_Beaver/events.out.tfevents.1613411037.Beaver.418259.8 differ diff --git a/runs/May25_09-58-37_Beaver/1621929520.2667465/events.out.tfevents.1621929520.Beaver.140926.1 b/runs/May25_09-58-37_Beaver/1621929520.2667465/events.out.tfevents.1621929520.Beaver.140926.1 new file mode 100644 index 0000000000000000000000000000000000000000..132ccfc030fcc484d2bc35f673bd20f615e9d8d3 Binary files /dev/null and b/runs/May25_09-58-37_Beaver/1621929520.2667465/events.out.tfevents.1621929520.Beaver.140926.1 differ diff --git a/runs/May25_09-58-37_Beaver/events.out.tfevents.1621929520.Beaver.140926.0 b/runs/May25_09-58-37_Beaver/events.out.tfevents.1621929520.Beaver.140926.0 new file mode 100644 index 0000000000000000000000000000000000000000..393441b3794f2d00aec638ebbdc950504456345b Binary files /dev/null and b/runs/May25_09-58-37_Beaver/events.out.tfevents.1621929520.Beaver.140926.0 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.30502/events.out.tfevents.1621929520.Beaver.140926.3 b/runs/May25_09-58-40_Beaver/1621929520.30502/events.out.tfevents.1621929520.Beaver.140926.3 new file mode 100644 index 0000000000000000000000000000000000000000..7eaf6d56313e2d8408384d5e55d4801a39946141 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.30502/events.out.tfevents.1621929520.Beaver.140926.3 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.3385923/events.out.tfevents.1621929520.Beaver.140926.5 b/runs/May25_09-58-40_Beaver/1621929520.3385923/events.out.tfevents.1621929520.Beaver.140926.5 new file mode 100644 index 0000000000000000000000000000000000000000..e0fa4803ab3409df521d291a13fa1bb7b4db7fd0 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.3385923/events.out.tfevents.1621929520.Beaver.140926.5 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.4424307/events.out.tfevents.1621929520.Beaver.140926.7 b/runs/May25_09-58-40_Beaver/1621929520.4424307/events.out.tfevents.1621929520.Beaver.140926.7 new file mode 100644 index 0000000000000000000000000000000000000000..dcbd233afb73c99b9bbbc1eeda1fb2e2583f532e Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.4424307/events.out.tfevents.1621929520.Beaver.140926.7 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.5577512/events.out.tfevents.1621929520.Beaver.140926.9 b/runs/May25_09-58-40_Beaver/1621929520.5577512/events.out.tfevents.1621929520.Beaver.140926.9 new file mode 100644 index 0000000000000000000000000000000000000000..8a6d95a620471fcbdab1e7a39b926b3b59afe435 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.5577512/events.out.tfevents.1621929520.Beaver.140926.9 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.6550326/events.out.tfevents.1621929520.Beaver.140926.11 b/runs/May25_09-58-40_Beaver/1621929520.6550326/events.out.tfevents.1621929520.Beaver.140926.11 new file mode 
100644 index 0000000000000000000000000000000000000000..30946d6d5eb8fa4abe2e9301f8f741106e52bdd9 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.6550326/events.out.tfevents.1621929520.Beaver.140926.11 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.7905831/events.out.tfevents.1621929520.Beaver.140926.13 b/runs/May25_09-58-40_Beaver/1621929520.7905831/events.out.tfevents.1621929520.Beaver.140926.13 new file mode 100644 index 0000000000000000000000000000000000000000..79e5a558c592f40f2a8ac5aaf496f0d9f6043ec4 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.7905831/events.out.tfevents.1621929520.Beaver.140926.13 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.8967938/events.out.tfevents.1621929520.Beaver.140926.15 b/runs/May25_09-58-40_Beaver/1621929520.8967938/events.out.tfevents.1621929520.Beaver.140926.15 new file mode 100644 index 0000000000000000000000000000000000000000..b6893408868b89c915c9b3df733ecc5f0dc8f602 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.8967938/events.out.tfevents.1621929520.Beaver.140926.15 differ diff --git a/runs/May25_09-58-40_Beaver/1621929520.972776/events.out.tfevents.1621929520.Beaver.140926.17 b/runs/May25_09-58-40_Beaver/1621929520.972776/events.out.tfevents.1621929520.Beaver.140926.17 new file mode 100644 index 0000000000000000000000000000000000000000..05cf8e959d6907a0813f4ccc385120c494d903b3 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/1621929520.972776/events.out.tfevents.1621929520.Beaver.140926.17 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.10 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.10 new file mode 100644 index 0000000000000000000000000000000000000000..d0fceed02a9abc0dc065fbaf4378b5aaa42e5690 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.10 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.12 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.12 new file mode 100644 index 0000000000000000000000000000000000000000..5b143bd1f70ff39e338df83216c359230b0ed53c Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.12 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.14 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.14 new file mode 100644 index 0000000000000000000000000000000000000000..eb57a1da42cf9eafb32533228993c4c33c08ee26 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.14 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.16 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.16 new file mode 100644 index 0000000000000000000000000000000000000000..f7348618a1a904b0d3e62ddf3e42e5be6c7c4212 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.16 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.2 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.2 new file mode 100644 index 0000000000000000000000000000000000000000..6038236fcd6e7acbef28e7da0dc02248bdae7d43 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.2 differ diff --git 
a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.4 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.4 new file mode 100644 index 0000000000000000000000000000000000000000..05f6dcfebdc79044a4789373306bd5e1e0741d49 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.4 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.6 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.6 new file mode 100644 index 0000000000000000000000000000000000000000..ffd9293321dd82a17ad05f75fdeb1d4bf11306a2 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.6 differ diff --git a/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.8 b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.8 new file mode 100644 index 0000000000000000000000000000000000000000..d39c6fe5eefd4e498f8adcb9a664d645e7ceed99 Binary files /dev/null and b/runs/May25_09-58-40_Beaver/events.out.tfevents.1621929520.Beaver.140926.8 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.0160458/events.out.tfevents.1621929521.Beaver.140926.19 b/runs/May25_09-58-41_Beaver/1621929521.0160458/events.out.tfevents.1621929521.Beaver.140926.19 new file mode 100644 index 0000000000000000000000000000000000000000..55f8206c760cd580be81258ad6246380a42ae391 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.0160458/events.out.tfevents.1621929521.Beaver.140926.19 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.0619283/events.out.tfevents.1621929521.Beaver.140926.21 b/runs/May25_09-58-41_Beaver/1621929521.0619283/events.out.tfevents.1621929521.Beaver.140926.21 new file mode 100644 index 0000000000000000000000000000000000000000..8c2d34ee88b449fdf0754c1cef029c7f27013296 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.0619283/events.out.tfevents.1621929521.Beaver.140926.21 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.1005495/events.out.tfevents.1621929521.Beaver.140926.23 b/runs/May25_09-58-41_Beaver/1621929521.1005495/events.out.tfevents.1621929521.Beaver.140926.23 new file mode 100644 index 0000000000000000000000000000000000000000..edd0ffe218dc998f967a23eed3807e51f602e219 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.1005495/events.out.tfevents.1621929521.Beaver.140926.23 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.1390715/events.out.tfevents.1621929521.Beaver.140926.25 b/runs/May25_09-58-41_Beaver/1621929521.1390715/events.out.tfevents.1621929521.Beaver.140926.25 new file mode 100644 index 0000000000000000000000000000000000000000..f8744458a62e7320a4f13040bc7eaf078bcba28c Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.1390715/events.out.tfevents.1621929521.Beaver.140926.25 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.1741974/events.out.tfevents.1621929521.Beaver.140926.27 b/runs/May25_09-58-41_Beaver/1621929521.1741974/events.out.tfevents.1621929521.Beaver.140926.27 new file mode 100644 index 0000000000000000000000000000000000000000..6423b73ffdff1aa1eddbd1ff6ca029f9d3885ed8 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.1741974/events.out.tfevents.1621929521.Beaver.140926.27 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.211107/events.out.tfevents.1621929521.Beaver.140926.29 b/runs/May25_09-58-41_Beaver/1621929521.211107/events.out.tfevents.1621929521.Beaver.140926.29 
new file mode 100644 index 0000000000000000000000000000000000000000..da2bf5aceb55994d2a4ffd51d8c7a152d31f0559 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.211107/events.out.tfevents.1621929521.Beaver.140926.29 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.3070912/events.out.tfevents.1621929521.Beaver.140926.33 b/runs/May25_09-58-41_Beaver/1621929521.3070912/events.out.tfevents.1621929521.Beaver.140926.33 new file mode 100644 index 0000000000000000000000000000000000000000..d0d319343a52597063d3f8f935f17107d61b4060 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.3070912/events.out.tfevents.1621929521.Beaver.140926.33 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.3477812/events.out.tfevents.1621929521.Beaver.140926.35 b/runs/May25_09-58-41_Beaver/1621929521.3477812/events.out.tfevents.1621929521.Beaver.140926.35 new file mode 100644 index 0000000000000000000000000000000000000000..d96b8c74ef88a616623301050ead954eb777f01d Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.3477812/events.out.tfevents.1621929521.Beaver.140926.35 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.3893588/events.out.tfevents.1621929521.Beaver.140926.37 b/runs/May25_09-58-41_Beaver/1621929521.3893588/events.out.tfevents.1621929521.Beaver.140926.37 new file mode 100644 index 0000000000000000000000000000000000000000..fe291b9dab078edb792975b7b7f9df134f4b9823 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.3893588/events.out.tfevents.1621929521.Beaver.140926.37 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.4362571/events.out.tfevents.1621929521.Beaver.140926.39 b/runs/May25_09-58-41_Beaver/1621929521.4362571/events.out.tfevents.1621929521.Beaver.140926.39 new file mode 100644 index 0000000000000000000000000000000000000000..a072d84b8ed6c4112c35f18d5c776eafbc7ca33c Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.4362571/events.out.tfevents.1621929521.Beaver.140926.39 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.4462042/events.out.tfevents.1621929521.Beaver.140926.41 b/runs/May25_09-58-41_Beaver/1621929521.4462042/events.out.tfevents.1621929521.Beaver.140926.41 new file mode 100644 index 0000000000000000000000000000000000000000..ccc8af748112732356b0586a75e79456a90f0b71 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.4462042/events.out.tfevents.1621929521.Beaver.140926.41 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.4867842/events.out.tfevents.1621929521.Beaver.140926.43 b/runs/May25_09-58-41_Beaver/1621929521.4867842/events.out.tfevents.1621929521.Beaver.140926.43 new file mode 100644 index 0000000000000000000000000000000000000000..7f194665f657fe5fffae0ad6e3340e4693e6024c Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.4867842/events.out.tfevents.1621929521.Beaver.140926.43 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.55299/events.out.tfevents.1621929521.Beaver.140926.47 b/runs/May25_09-58-41_Beaver/1621929521.55299/events.out.tfevents.1621929521.Beaver.140926.47 new file mode 100644 index 0000000000000000000000000000000000000000..618e68d4918dfc53536321d15036f86c8a49df57 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.55299/events.out.tfevents.1621929521.Beaver.140926.47 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.5932963/events.out.tfevents.1621929521.Beaver.140926.49 b/runs/May25_09-58-41_Beaver/1621929521.5932963/events.out.tfevents.1621929521.Beaver.140926.49 new file mode 
100644 index 0000000000000000000000000000000000000000..a7ce8de2bba2cea8fb6c7faaebef883a966ed344 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.5932963/events.out.tfevents.1621929521.Beaver.140926.49 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.6587298/events.out.tfevents.1621929521.Beaver.140926.52 b/runs/May25_09-58-41_Beaver/1621929521.6587298/events.out.tfevents.1621929521.Beaver.140926.52 new file mode 100644 index 0000000000000000000000000000000000000000..f1dc313043aaa783d03ee203d3c5fee95615bbe0 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.6587298/events.out.tfevents.1621929521.Beaver.140926.52 differ diff --git a/runs/May25_09-58-41_Beaver/1621929521.6989424/events.out.tfevents.1621929521.Beaver.140926.54 b/runs/May25_09-58-41_Beaver/1621929521.6989424/events.out.tfevents.1621929521.Beaver.140926.54 new file mode 100644 index 0000000000000000000000000000000000000000..6a4ee19fe1b81ca80c4ca4bc4a5e8d296516b3ce Binary files /dev/null and b/runs/May25_09-58-41_Beaver/1621929521.6989424/events.out.tfevents.1621929521.Beaver.140926.54 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.18 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.18 new file mode 100644 index 0000000000000000000000000000000000000000..9a70691fb527fc87bdba597c1985f9a4e21a52bd Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.18 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.20 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.20 new file mode 100644 index 0000000000000000000000000000000000000000..9ca45d117dc3646734c694766c03f60f1a5c8834 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.20 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.22 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.22 new file mode 100644 index 0000000000000000000000000000000000000000..31b2652c197cd64a367e159b974cf517c3c2376b Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.22 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.24 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.24 new file mode 100644 index 0000000000000000000000000000000000000000..6df96019e21fca60e8be22eab0ebd0e9f9ba1e2a Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.24 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.26 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.26 new file mode 100644 index 0000000000000000000000000000000000000000..fca1f3a4436659a0844a061d83e82760303de6a4 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.26 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.28 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.28 new file mode 100644 index 0000000000000000000000000000000000000000..5b8e32fa11b736f9d7d4f46f4b9f11e12e5d71fa Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.28 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.30 
b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.30 new file mode 100644 index 0000000000000000000000000000000000000000..b4ead84f48b7da4bed33e9307319640be2f4f76a Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.30 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.31 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.31 new file mode 100644 index 0000000000000000000000000000000000000000..e857c1073e8b39a9033e663660ece05bb26cacfb Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.31 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.32 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.32 new file mode 100644 index 0000000000000000000000000000000000000000..1a84318b4d06c30ca5c04559a366a180257b1315 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.32 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.34 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.34 new file mode 100644 index 0000000000000000000000000000000000000000..b81e3590ba70600288cf0d7b9e5c21bfbbc0afa8 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.34 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.36 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.36 new file mode 100644 index 0000000000000000000000000000000000000000..ad67c132a8516988db0d83a1f954b4e579c2782f Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.36 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.38 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.38 new file mode 100644 index 0000000000000000000000000000000000000000..d076e7599d48ca8b04466fac4900336c160b304a Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.38 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.40 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.40 new file mode 100644 index 0000000000000000000000000000000000000000..b598a3e06aec6845dc035ea3b59780fca17b705b Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.40 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.42 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.42 new file mode 100644 index 0000000000000000000000000000000000000000..608e7e3ca9dc33d61a4314730578f7b81119cec0 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.42 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.44 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.44 new file mode 100644 index 0000000000000000000000000000000000000000..533e9adc1ae06e9f88263b555525991569b4c2c0 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.44 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.45 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.45 new 
file mode 100644 index 0000000000000000000000000000000000000000..235f0134ca63fbddd0c93aaaa2b11c5716385318 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.45 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.46 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.46 new file mode 100644 index 0000000000000000000000000000000000000000..3df1f6f4475aa90765a07759491afff3b1cdd2b2 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.46 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.48 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.48 new file mode 100644 index 0000000000000000000000000000000000000000..94cf2096fc4e9b54807c9b185975c874174d369b Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.48 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.50 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.50 new file mode 100644 index 0000000000000000000000000000000000000000..d6372901393a2fbce4beb6d579ae9520b023aa8a Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.50 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.51 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.51 new file mode 100644 index 0000000000000000000000000000000000000000..5b2d64aae68973a16a74b23b4b6a928d39d4ff20 Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.51 differ diff --git a/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.53 b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.53 new file mode 100644 index 0000000000000000000000000000000000000000..81145d56152b0b7d91fa7a8d04da2042e8e61acd Binary files /dev/null and b/runs/May25_09-58-41_Beaver/events.out.tfevents.1621929521.Beaver.140926.53 differ diff --git a/runs/May25_09-58-42_Beaver/1621929522.0873444/events.out.tfevents.1621929522.Beaver.140926.56 b/runs/May25_09-58-42_Beaver/1621929522.0873444/events.out.tfevents.1621929522.Beaver.140926.56 new file mode 100644 index 0000000000000000000000000000000000000000..6d67d3b871e365b45d06bdb4aa1905c0bf86a909 Binary files /dev/null and b/runs/May25_09-58-42_Beaver/1621929522.0873444/events.out.tfevents.1621929522.Beaver.140926.56 differ diff --git a/runs/May25_09-58-42_Beaver/1621929522.125773/events.out.tfevents.1621929522.Beaver.140926.58 b/runs/May25_09-58-42_Beaver/1621929522.125773/events.out.tfevents.1621929522.Beaver.140926.58 new file mode 100644 index 0000000000000000000000000000000000000000..c672c723d0f6e4f3af5964e3d77f96ee14e3938b Binary files /dev/null and b/runs/May25_09-58-42_Beaver/1621929522.125773/events.out.tfevents.1621929522.Beaver.140926.58 differ diff --git a/runs/May25_09-58-42_Beaver/1621929522.1686683/events.out.tfevents.1621929522.Beaver.140926.60 b/runs/May25_09-58-42_Beaver/1621929522.1686683/events.out.tfevents.1621929522.Beaver.140926.60 new file mode 100644 index 0000000000000000000000000000000000000000..e586608b9831362f7189c6c38fa598cb355defee Binary files /dev/null and b/runs/May25_09-58-42_Beaver/1621929522.1686683/events.out.tfevents.1621929522.Beaver.140926.60 differ diff --git 
a/runs/May25_09-58-42_Beaver/1621929522.2041962/events.out.tfevents.1621929522.Beaver.140926.62 b/runs/May25_09-58-42_Beaver/1621929522.2041962/events.out.tfevents.1621929522.Beaver.140926.62
new file mode 100644
index 0000000000000000000000000000000000000000..bec6eb0e7a9ea29be6496fabee9bb13d69d6861a
Binary files /dev/null and b/runs/May25_09-58-42_Beaver/1621929522.2041962/events.out.tfevents.1621929522.Beaver.140926.62 differ

[Omitted here: roughly 170 further stanzas of the same form, each adding one binary TensorBoard event log (events.out.tfevents.*) under runs/May25_09-58-42_Beaver/, runs/May25_09-58-43_Beaver/, runs/May25_09-58-44_Beaver/, runs/May25_09-58-45_Beaver/, runs/May25_10-00-23_Beaver/, or runs/May25_10-00-26_Beaver/. Each stanza records only "new file mode 100644", a blob hash, and "Binary files /dev/null and b/<path> differ"; no textual content is recoverable.]

diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.32 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.32
new file mode 100644
index 0000000000000000000000000000000000000000..ed70410fd31bb6f5df40d46375aa05e7d1c1a617
Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.32 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.34 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.34 new file mode 100644 index 0000000000000000000000000000000000000000..ad7f5d0e46202fbe53052de08b5faa31f0e47b04 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.34 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.36 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.36 new file mode 100644 index 0000000000000000000000000000000000000000..23fa1d00404675106de6891e13d1df5321b26368 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.36 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.38 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.38 new file mode 100644 index 0000000000000000000000000000000000000000..93d5a2a6f48a99d664295de86766fbcc3ccd41ff Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.38 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.4 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.4 new file mode 100644 index 0000000000000000000000000000000000000000..b8343d97e4a1d9588d3d8b443c0cff3cbb256484 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.4 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.40 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.40 new file mode 100644 index 0000000000000000000000000000000000000000..1f57bd39a3f8e02ff61fd4b50276290c096b2976 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.40 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.42 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.42 new file mode 100644 index 0000000000000000000000000000000000000000..3241259e31faccd7e840443c9cbb51f11f721485 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.42 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.44 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.44 new file mode 100644 index 0000000000000000000000000000000000000000..e90587792ac30601e065c82d88395eafbcf4a4f7 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.44 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.45 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.45 new file mode 100644 index 0000000000000000000000000000000000000000..a15e7e92e2dbbcdb167b430aa1e44d293af46f66 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.45 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.46 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.46 new file mode 100644 index 0000000000000000000000000000000000000000..666a8c491776064c5c7ee69b451c44fb685ed594 Binary files /dev/null and 
b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.46 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.48 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.48 new file mode 100644 index 0000000000000000000000000000000000000000..4dc94df76899f914089676837a9e8b1bc17dcbfe Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.48 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.50 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.50 new file mode 100644 index 0000000000000000000000000000000000000000..4502f3e3e4895ff43654f7e2b7d1d9a7f9c3e624 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.50 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.6 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.6 new file mode 100644 index 0000000000000000000000000000000000000000..5d77f6ee07971b709e033e7d6f58f5fce486e6c7 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.6 differ diff --git a/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.8 b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.8 new file mode 100644 index 0000000000000000000000000000000000000000..a97c02205169059b65f7ccababcfa36854ed2621 Binary files /dev/null and b/runs/May25_10-00-26_Beaver/events.out.tfevents.1621929626.Beaver.142246.8 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.0051198/events.out.tfevents.1621929627.Beaver.142246.52 b/runs/May25_10-00-27_Beaver/1621929627.0051198/events.out.tfevents.1621929627.Beaver.142246.52 new file mode 100644 index 0000000000000000000000000000000000000000..cb4479e7ad4059f0223c77ce531fafc27779055b Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.0051198/events.out.tfevents.1621929627.Beaver.142246.52 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.0357857/events.out.tfevents.1621929627.Beaver.142246.54 b/runs/May25_10-00-27_Beaver/1621929627.0357857/events.out.tfevents.1621929627.Beaver.142246.54 new file mode 100644 index 0000000000000000000000000000000000000000..125c9442fa874076c89cf681ab22bf86fe8ee69c Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.0357857/events.out.tfevents.1621929627.Beaver.142246.54 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.343746/events.out.tfevents.1621929627.Beaver.142246.56 b/runs/May25_10-00-27_Beaver/1621929627.343746/events.out.tfevents.1621929627.Beaver.142246.56 new file mode 100644 index 0000000000000000000000000000000000000000..4c6094b85b6f6232dc6cb53af02b67e6e16e3b7c Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.343746/events.out.tfevents.1621929627.Beaver.142246.56 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.375256/events.out.tfevents.1621929627.Beaver.142246.58 b/runs/May25_10-00-27_Beaver/1621929627.375256/events.out.tfevents.1621929627.Beaver.142246.58 new file mode 100644 index 0000000000000000000000000000000000000000..8272f01bdd546920a05144cb9359027065fe9b12 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.375256/events.out.tfevents.1621929627.Beaver.142246.58 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.4082663/events.out.tfevents.1621929627.Beaver.142246.60 
b/runs/May25_10-00-27_Beaver/1621929627.4082663/events.out.tfevents.1621929627.Beaver.142246.60 new file mode 100644 index 0000000000000000000000000000000000000000..91981d0e32f6119385d7f2dde1b0bf81bcaa44b9 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.4082663/events.out.tfevents.1621929627.Beaver.142246.60 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.4390442/events.out.tfevents.1621929627.Beaver.142246.62 b/runs/May25_10-00-27_Beaver/1621929627.4390442/events.out.tfevents.1621929627.Beaver.142246.62 new file mode 100644 index 0000000000000000000000000000000000000000..c11b39aef34e5b38c6fcd51e342c972c87541b11 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.4390442/events.out.tfevents.1621929627.Beaver.142246.62 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.4708414/events.out.tfevents.1621929627.Beaver.142246.64 b/runs/May25_10-00-27_Beaver/1621929627.4708414/events.out.tfevents.1621929627.Beaver.142246.64 new file mode 100644 index 0000000000000000000000000000000000000000..c7b0ccad376f6d6bec8272c54628168043f4173a Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.4708414/events.out.tfevents.1621929627.Beaver.142246.64 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.800928/events.out.tfevents.1621929627.Beaver.142246.68 b/runs/May25_10-00-27_Beaver/1621929627.800928/events.out.tfevents.1621929627.Beaver.142246.68 new file mode 100644 index 0000000000000000000000000000000000000000..535720fb296134d30952e7a6430bac691a25f707 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.800928/events.out.tfevents.1621929627.Beaver.142246.68 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.8332822/events.out.tfevents.1621929627.Beaver.142246.70 b/runs/May25_10-00-27_Beaver/1621929627.8332822/events.out.tfevents.1621929627.Beaver.142246.70 new file mode 100644 index 0000000000000000000000000000000000000000..ae34ebc14d3e0d7a54094c7a9421ab88eccb321e Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.8332822/events.out.tfevents.1621929627.Beaver.142246.70 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.8644283/events.out.tfevents.1621929627.Beaver.142246.72 b/runs/May25_10-00-27_Beaver/1621929627.8644283/events.out.tfevents.1621929627.Beaver.142246.72 new file mode 100644 index 0000000000000000000000000000000000000000..c6e2b767e90416c061a7debe79d7000a87fc6d04 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.8644283/events.out.tfevents.1621929627.Beaver.142246.72 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.9068155/events.out.tfevents.1621929627.Beaver.142246.74 b/runs/May25_10-00-27_Beaver/1621929627.9068155/events.out.tfevents.1621929627.Beaver.142246.74 new file mode 100644 index 0000000000000000000000000000000000000000..202d941a42f601c41502581dd81fee9fd0983253 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.9068155/events.out.tfevents.1621929627.Beaver.142246.74 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.9389122/events.out.tfevents.1621929627.Beaver.142246.76 b/runs/May25_10-00-27_Beaver/1621929627.9389122/events.out.tfevents.1621929627.Beaver.142246.76 new file mode 100644 index 0000000000000000000000000000000000000000..d765cedeb44fbabafe2e23b930d3557a7607d857 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.9389122/events.out.tfevents.1621929627.Beaver.142246.76 differ diff --git a/runs/May25_10-00-27_Beaver/1621929627.9700875/events.out.tfevents.1621929627.Beaver.142246.78 
b/runs/May25_10-00-27_Beaver/1621929627.9700875/events.out.tfevents.1621929627.Beaver.142246.78 new file mode 100644 index 0000000000000000000000000000000000000000..d78b045dbfe97d2dd22c888791095201a3b4f966 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/1621929627.9700875/events.out.tfevents.1621929627.Beaver.142246.78 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.51 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.51 new file mode 100644 index 0000000000000000000000000000000000000000..c152a0c95fb77fb984134e4166495f29cbaed0cf Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.51 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.53 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.53 new file mode 100644 index 0000000000000000000000000000000000000000..797551a471693e0005d92319fb80e08e2b859ea3 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.53 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.55 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.55 new file mode 100644 index 0000000000000000000000000000000000000000..6842377ea70e496d2d0144124872e0b1706c1834 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.55 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.57 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.57 new file mode 100644 index 0000000000000000000000000000000000000000..84c5e1b39d82f27d8ae8aa17c95b39f32fb2fd1c Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.57 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.59 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.59 new file mode 100644 index 0000000000000000000000000000000000000000..8d14d2b9608a77853a6b49aa05ce0626f92cbd19 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.59 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.61 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.61 new file mode 100644 index 0000000000000000000000000000000000000000..068382853a2a26cea52a9da8a78363543b4024ea Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.61 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.63 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.63 new file mode 100644 index 0000000000000000000000000000000000000000..c39293428ae0347764c26c0a8379b6210e12953d Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.63 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.65 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.65 new file mode 100644 index 0000000000000000000000000000000000000000..467609c74b4176b121e088a1cb847cfd012e3092 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.65 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.66 
b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.66 new file mode 100644 index 0000000000000000000000000000000000000000..af02f33712caac5543887fee3612fde4fa07e9a6 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.66 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.67 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.67 new file mode 100644 index 0000000000000000000000000000000000000000..8ee350bb883e986bbd8a10ced6740dd1a9f9e4f1 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.67 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.69 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.69 new file mode 100644 index 0000000000000000000000000000000000000000..0a24955879a6cb15099dc79b0962fb16bd8ef3dd Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.69 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.71 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.71 new file mode 100644 index 0000000000000000000000000000000000000000..b832a1c27733f401c4164caed484bb3e06b4fc16 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.71 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.73 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.73 new file mode 100644 index 0000000000000000000000000000000000000000..647d021eb1f61b2d112079e141e163f9ca0c9d75 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.73 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.75 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.75 new file mode 100644 index 0000000000000000000000000000000000000000..061520b7d6bf2d4366f689a6ecd1c09e50f02d2c Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.75 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.77 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.77 new file mode 100644 index 0000000000000000000000000000000000000000..962e7088a49b40c9de371f31e0f1710f86d9bbd5 Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929627.Beaver.142246.77 differ diff --git a/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929628.Beaver.142246.79 b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929628.Beaver.142246.79 new file mode 100644 index 0000000000000000000000000000000000000000..98478c20a67e9ccd81187e5d9842f45deb5131ea Binary files /dev/null and b/runs/May25_10-00-27_Beaver/events.out.tfevents.1621929628.Beaver.142246.79 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.0610754/events.out.tfevents.1621929628.Beaver.142246.81 b/runs/May25_10-00-28_Beaver/1621929628.0610754/events.out.tfevents.1621929628.Beaver.142246.81 new file mode 100644 index 0000000000000000000000000000000000000000..cdf7b27d1734729a7ad18ae3c8a27c0b827c9675 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.0610754/events.out.tfevents.1621929628.Beaver.142246.81 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.1517868/events.out.tfevents.1621929628.Beaver.142246.84 
b/runs/May25_10-00-28_Beaver/1621929628.1517868/events.out.tfevents.1621929628.Beaver.142246.84 new file mode 100644 index 0000000000000000000000000000000000000000..489975ef0d507929fb79fd49e4d1bf709719ea79 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.1517868/events.out.tfevents.1621929628.Beaver.142246.84 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.2332213/events.out.tfevents.1621929628.Beaver.142246.87 b/runs/May25_10-00-28_Beaver/1621929628.2332213/events.out.tfevents.1621929628.Beaver.142246.87 new file mode 100644 index 0000000000000000000000000000000000000000..082d9333d0fdbbed4faa37d6ba4578438712e473 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.2332213/events.out.tfevents.1621929628.Beaver.142246.87 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.3112667/events.out.tfevents.1621929628.Beaver.142246.90 b/runs/May25_10-00-28_Beaver/1621929628.3112667/events.out.tfevents.1621929628.Beaver.142246.90 new file mode 100644 index 0000000000000000000000000000000000000000..66fa4ecf4d13139f901e09b961a444e798e47e8a Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.3112667/events.out.tfevents.1621929628.Beaver.142246.90 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.3437808/events.out.tfevents.1621929628.Beaver.142246.92 b/runs/May25_10-00-28_Beaver/1621929628.3437808/events.out.tfevents.1621929628.Beaver.142246.92 new file mode 100644 index 0000000000000000000000000000000000000000..0be8cd8e9d42eefcc86ccdef0908c6f15e7b2a7f Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.3437808/events.out.tfevents.1621929628.Beaver.142246.92 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.55281/events.out.tfevents.1621929628.Beaver.142246.94 b/runs/May25_10-00-28_Beaver/1621929628.55281/events.out.tfevents.1621929628.Beaver.142246.94 new file mode 100644 index 0000000000000000000000000000000000000000..d1f87a360ea1f357ef8836f5865026b8480d1f7a Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.55281/events.out.tfevents.1621929628.Beaver.142246.94 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.8906076/events.out.tfevents.1621929628.Beaver.142246.97 b/runs/May25_10-00-28_Beaver/1621929628.8906076/events.out.tfevents.1621929628.Beaver.142246.97 new file mode 100644 index 0000000000000000000000000000000000000000..e4644cd3e85f22a59e89924e856fd3ef13e56a87 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.8906076/events.out.tfevents.1621929628.Beaver.142246.97 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.9347723/events.out.tfevents.1621929628.Beaver.142246.100 b/runs/May25_10-00-28_Beaver/1621929628.9347723/events.out.tfevents.1621929628.Beaver.142246.100 new file mode 100644 index 0000000000000000000000000000000000000000..bfa7b69e5b798404a65c619ceaa1e6f2b4c7f8d2 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.9347723/events.out.tfevents.1621929628.Beaver.142246.100 differ diff --git a/runs/May25_10-00-28_Beaver/1621929628.9711795/events.out.tfevents.1621929628.Beaver.142246.102 b/runs/May25_10-00-28_Beaver/1621929628.9711795/events.out.tfevents.1621929628.Beaver.142246.102 new file mode 100644 index 0000000000000000000000000000000000000000..39b950ab47478a33c724a8c9d0a018dba0352f6f Binary files /dev/null and b/runs/May25_10-00-28_Beaver/1621929628.9711795/events.out.tfevents.1621929628.Beaver.142246.102 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.101 
b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.101 new file mode 100644 index 0000000000000000000000000000000000000000..bb74d56a18d60aaa11e920c21343a722e66bf83a Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.101 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.80 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.80 new file mode 100644 index 0000000000000000000000000000000000000000..4505b48d37be1b705fe5404088c835c255f4394b Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.80 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.82 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.82 new file mode 100644 index 0000000000000000000000000000000000000000..7649a479ce355b7041b57482cf488dc7a15310ce Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.82 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.83 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.83 new file mode 100644 index 0000000000000000000000000000000000000000..9d2c6a9308e7fd07ff8bab2766debe69f822bbfc Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.83 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.85 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.85 new file mode 100644 index 0000000000000000000000000000000000000000..50833387f70eac351d71757ef34699fc265c7339 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.85 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.86 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.86 new file mode 100644 index 0000000000000000000000000000000000000000..4bb9fbc2853665168d991c9a3f0d1063346eaf7a Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.86 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.88 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.88 new file mode 100644 index 0000000000000000000000000000000000000000..bcdc975488f3425be2062cee5cde21465f0150fe Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.88 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.89 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.89 new file mode 100644 index 0000000000000000000000000000000000000000..289e65896cf9371d73c257b50ad10ef98c18cee3 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.89 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.91 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.91 new file mode 100644 index 0000000000000000000000000000000000000000..7286026c1ab634691e3c73fa73981a1ffd7c7e93 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.91 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.93 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.93 
new file mode 100644 index 0000000000000000000000000000000000000000..e2a78887d0f75f031770dc0c8c02bce6ef5dd0a0 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.93 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.95 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.95 new file mode 100644 index 0000000000000000000000000000000000000000..57b61e63b1e62675b60286225d7cebc708c2aeda Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.95 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.96 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.96 new file mode 100644 index 0000000000000000000000000000000000000000..e12707d4ce7ebcbb7d21bda031f2bda29f020a40 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.96 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.98 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.98 new file mode 100644 index 0000000000000000000000000000000000000000..5e3ba65fae3a8d9d8e251cd7d56b23db3ccebd15 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.98 differ diff --git a/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.99 b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.99 new file mode 100644 index 0000000000000000000000000000000000000000..aca7887e2edeceb276edef22eb90da29a2d4b784 Binary files /dev/null and b/runs/May25_10-00-28_Beaver/events.out.tfevents.1621929628.Beaver.142246.99 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.0058544/events.out.tfevents.1621929629.Beaver.142246.104 b/runs/May25_10-00-29_Beaver/1621929629.0058544/events.out.tfevents.1621929629.Beaver.142246.104 new file mode 100644 index 0000000000000000000000000000000000000000..a45da56adfec293d86bb5c5d15d64efc7148db75 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.0058544/events.out.tfevents.1621929629.Beaver.142246.104 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.0357509/events.out.tfevents.1621929629.Beaver.142246.106 b/runs/May25_10-00-29_Beaver/1621929629.0357509/events.out.tfevents.1621929629.Beaver.142246.106 new file mode 100644 index 0000000000000000000000000000000000000000..bbbf0a7d827e6127d1a6c9a0a0c7d96944d65892 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.0357509/events.out.tfevents.1621929629.Beaver.142246.106 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.0653656/events.out.tfevents.1621929629.Beaver.142246.108 b/runs/May25_10-00-29_Beaver/1621929629.0653656/events.out.tfevents.1621929629.Beaver.142246.108 new file mode 100644 index 0000000000000000000000000000000000000000..c71b31453dd6b64a4f087fdb4142010f04546632 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.0653656/events.out.tfevents.1621929629.Beaver.142246.108 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.0977867/events.out.tfevents.1621929629.Beaver.142246.110 b/runs/May25_10-00-29_Beaver/1621929629.0977867/events.out.tfevents.1621929629.Beaver.142246.110 new file mode 100644 index 0000000000000000000000000000000000000000..99e621814b6e6c1fba85597ffd58f586b6e0e353 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.0977867/events.out.tfevents.1621929629.Beaver.142246.110 differ diff 
--git a/runs/May25_10-00-29_Beaver/1621929629.1301053/events.out.tfevents.1621929629.Beaver.142246.112 b/runs/May25_10-00-29_Beaver/1621929629.1301053/events.out.tfevents.1621929629.Beaver.142246.112 new file mode 100644 index 0000000000000000000000000000000000000000..b82219897d6b2704e09b93f2de840b53914f0b8f Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.1301053/events.out.tfevents.1621929629.Beaver.142246.112 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.1655993/events.out.tfevents.1621929629.Beaver.142246.114 b/runs/May25_10-00-29_Beaver/1621929629.1655993/events.out.tfevents.1621929629.Beaver.142246.114 new file mode 100644 index 0000000000000000000000000000000000000000..dd853371c6ca6257d113cbf136cf93f6861cb46b Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.1655993/events.out.tfevents.1621929629.Beaver.142246.114 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.1982548/events.out.tfevents.1621929629.Beaver.142246.116 b/runs/May25_10-00-29_Beaver/1621929629.1982548/events.out.tfevents.1621929629.Beaver.142246.116 new file mode 100644 index 0000000000000000000000000000000000000000..e67bb12f6c832b42b4997b8beeffed5a1d4ec893 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.1982548/events.out.tfevents.1621929629.Beaver.142246.116 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.2312965/events.out.tfevents.1621929629.Beaver.142246.118 b/runs/May25_10-00-29_Beaver/1621929629.2312965/events.out.tfevents.1621929629.Beaver.142246.118 new file mode 100644 index 0000000000000000000000000000000000000000..380f979aa3b5a6c5374e7fc2c180d4655d4e0c9a Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.2312965/events.out.tfevents.1621929629.Beaver.142246.118 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.2464006/events.out.tfevents.1621929629.Beaver.142246.120 b/runs/May25_10-00-29_Beaver/1621929629.2464006/events.out.tfevents.1621929629.Beaver.142246.120 new file mode 100644 index 0000000000000000000000000000000000000000..b1eab201e085ca834d267c75f95e8c6e568c81d7 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.2464006/events.out.tfevents.1621929629.Beaver.142246.120 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.2620585/events.out.tfevents.1621929629.Beaver.142246.122 b/runs/May25_10-00-29_Beaver/1621929629.2620585/events.out.tfevents.1621929629.Beaver.142246.122 new file mode 100644 index 0000000000000000000000000000000000000000..d7c1cce66663e4ee1675e7b3465079c06f2c6e1b Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.2620585/events.out.tfevents.1621929629.Beaver.142246.122 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.2938466/events.out.tfevents.1621929629.Beaver.142246.124 b/runs/May25_10-00-29_Beaver/1621929629.2938466/events.out.tfevents.1621929629.Beaver.142246.124 new file mode 100644 index 0000000000000000000000000000000000000000..ee963c6a3c6de0f50da19775c92ee6926090b39e Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.2938466/events.out.tfevents.1621929629.Beaver.142246.124 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.3254178/events.out.tfevents.1621929629.Beaver.142246.126 b/runs/May25_10-00-29_Beaver/1621929629.3254178/events.out.tfevents.1621929629.Beaver.142246.126 new file mode 100644 index 0000000000000000000000000000000000000000..aedced26c412aa12e6a809c2029be21f281630b4 Binary files /dev/null and 
b/runs/May25_10-00-29_Beaver/1621929629.3254178/events.out.tfevents.1621929629.Beaver.142246.126 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.3548/events.out.tfevents.1621929629.Beaver.142246.128 b/runs/May25_10-00-29_Beaver/1621929629.3548/events.out.tfevents.1621929629.Beaver.142246.128 new file mode 100644 index 0000000000000000000000000000000000000000..2fa2639f1ad38a31b4ad227674bf533a4707d6b0 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.3548/events.out.tfevents.1621929629.Beaver.142246.128 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.3733013/events.out.tfevents.1621929629.Beaver.142246.130 b/runs/May25_10-00-29_Beaver/1621929629.3733013/events.out.tfevents.1621929629.Beaver.142246.130 new file mode 100644 index 0000000000000000000000000000000000000000..e252f8f969f5de8f5bc6eca8561bcda72a9020e2 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.3733013/events.out.tfevents.1621929629.Beaver.142246.130 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.3905444/events.out.tfevents.1621929629.Beaver.142246.132 b/runs/May25_10-00-29_Beaver/1621929629.3905444/events.out.tfevents.1621929629.Beaver.142246.132 new file mode 100644 index 0000000000000000000000000000000000000000..b1eadd12f6d9f323931bb0dbc5a6b3daf9016707 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.3905444/events.out.tfevents.1621929629.Beaver.142246.132 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.420209/events.out.tfevents.1621929629.Beaver.142246.134 b/runs/May25_10-00-29_Beaver/1621929629.420209/events.out.tfevents.1621929629.Beaver.142246.134 new file mode 100644 index 0000000000000000000000000000000000000000..567501d830720c93a08dbf936be8f50d31c9ad82 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.420209/events.out.tfevents.1621929629.Beaver.142246.134 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.486138/events.out.tfevents.1621929629.Beaver.142246.136 b/runs/May25_10-00-29_Beaver/1621929629.486138/events.out.tfevents.1621929629.Beaver.142246.136 new file mode 100644 index 0000000000000000000000000000000000000000..56014caefbfa8e5984b358204eed1333f435509f Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.486138/events.out.tfevents.1621929629.Beaver.142246.136 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.5189574/events.out.tfevents.1621929629.Beaver.142246.138 b/runs/May25_10-00-29_Beaver/1621929629.5189574/events.out.tfevents.1621929629.Beaver.142246.138 new file mode 100644 index 0000000000000000000000000000000000000000..3e271fc2a4501c0a0fcd18d146abc19f56c65b83 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.5189574/events.out.tfevents.1621929629.Beaver.142246.138 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.5649562/events.out.tfevents.1621929629.Beaver.142246.140 b/runs/May25_10-00-29_Beaver/1621929629.5649562/events.out.tfevents.1621929629.Beaver.142246.140 new file mode 100644 index 0000000000000000000000000000000000000000..d9cb6d8815e8e66e4a2b37399c24614c7bf137d9 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.5649562/events.out.tfevents.1621929629.Beaver.142246.140 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.5960956/events.out.tfevents.1621929629.Beaver.142246.142 b/runs/May25_10-00-29_Beaver/1621929629.5960956/events.out.tfevents.1621929629.Beaver.142246.142 new file mode 100644 index 0000000000000000000000000000000000000000..90c0bc30b698ec3d25f1a6e29f4ebb3156d0a7b6 Binary files /dev/null and 
b/runs/May25_10-00-29_Beaver/1621929629.5960956/events.out.tfevents.1621929629.Beaver.142246.142 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.6296866/events.out.tfevents.1621929629.Beaver.142246.144 b/runs/May25_10-00-29_Beaver/1621929629.6296866/events.out.tfevents.1621929629.Beaver.142246.144 new file mode 100644 index 0000000000000000000000000000000000000000..a11e45bc2cb29ebfd59514c080044bdb9c17d3bd Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.6296866/events.out.tfevents.1621929629.Beaver.142246.144 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.6615765/events.out.tfevents.1621929629.Beaver.142246.146 b/runs/May25_10-00-29_Beaver/1621929629.6615765/events.out.tfevents.1621929629.Beaver.142246.146 new file mode 100644 index 0000000000000000000000000000000000000000..8ab2acb8c6874a593f65a0dbdd6713be5632b3b3 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.6615765/events.out.tfevents.1621929629.Beaver.142246.146 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.6964898/events.out.tfevents.1621929629.Beaver.142246.148 b/runs/May25_10-00-29_Beaver/1621929629.6964898/events.out.tfevents.1621929629.Beaver.142246.148 new file mode 100644 index 0000000000000000000000000000000000000000..1233e94e4898c413c2025d3808375bf04606bda5 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.6964898/events.out.tfevents.1621929629.Beaver.142246.148 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.7286127/events.out.tfevents.1621929629.Beaver.142246.150 b/runs/May25_10-00-29_Beaver/1621929629.7286127/events.out.tfevents.1621929629.Beaver.142246.150 new file mode 100644 index 0000000000000000000000000000000000000000..d0b93c33c8c7f9cad994e2658812b1a0d9728ae4 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.7286127/events.out.tfevents.1621929629.Beaver.142246.150 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.7610254/events.out.tfevents.1621929629.Beaver.142246.152 b/runs/May25_10-00-29_Beaver/1621929629.7610254/events.out.tfevents.1621929629.Beaver.142246.152 new file mode 100644 index 0000000000000000000000000000000000000000..2ca5677481423df7f7a12a258a89588e9a57c28d Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.7610254/events.out.tfevents.1621929629.Beaver.142246.152 differ diff --git a/runs/May25_10-00-29_Beaver/1621929629.9056401/events.out.tfevents.1621929629.Beaver.142246.154 b/runs/May25_10-00-29_Beaver/1621929629.9056401/events.out.tfevents.1621929629.Beaver.142246.154 new file mode 100644 index 0000000000000000000000000000000000000000..ec4495dba2065299d725c112e2bb92625f2ec0d4 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/1621929629.9056401/events.out.tfevents.1621929629.Beaver.142246.154 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.103 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.103 new file mode 100644 index 0000000000000000000000000000000000000000..c1da247a568573fe865fa1c2cbd63ed4bd4b7b84 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.103 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.105 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.105 new file mode 100644 index 0000000000000000000000000000000000000000..667ada806fde3de6bdb89913dd0b481e637a4427 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.105 
differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.107 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.107 new file mode 100644 index 0000000000000000000000000000000000000000..85880fecf6b8bcf36bb24237913e06c80dfe794e Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.107 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.109 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.109 new file mode 100644 index 0000000000000000000000000000000000000000..c40b7c046d3927c6f205420e214e31a3afe9c6f3 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.109 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.111 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.111 new file mode 100644 index 0000000000000000000000000000000000000000..8ee38114a40d75c360ee3f27a13b234bf4177a78 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.111 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.113 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.113 new file mode 100644 index 0000000000000000000000000000000000000000..e7150cb2dd65d7f7f53db5b84317e976946b6e72 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.113 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.115 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.115 new file mode 100644 index 0000000000000000000000000000000000000000..57bcb63ca4ed5861816ea3cdf99c0b0424f8145f Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.115 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.117 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.117 new file mode 100644 index 0000000000000000000000000000000000000000..63bb172a2623063ab5e52f954313d981d86892c8 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.117 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.119 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.119 new file mode 100644 index 0000000000000000000000000000000000000000..0cb0b435c578e25ec0ab874013ca00c82b33d60f Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.119 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.121 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.121 new file mode 100644 index 0000000000000000000000000000000000000000..75dcd21391ab17b67f743297ecc70339ab1628f9 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.121 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.123 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.123 new file mode 100644 index 0000000000000000000000000000000000000000..5529969a9337fb6b6ea105d66af2022ccc194ba8 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.123 differ diff --git 
a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.125 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.125 new file mode 100644 index 0000000000000000000000000000000000000000..dd2b28b1711b1679d17fa8bd79e92579760814da Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.125 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.127 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.127 new file mode 100644 index 0000000000000000000000000000000000000000..f94f2f28552af9e479805ea1e1393dc5e7c76aa0 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.127 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.129 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.129 new file mode 100644 index 0000000000000000000000000000000000000000..8a2b3ca2dda0f7b3f1d527824caef1fabe24f7be Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.129 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.131 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.131 new file mode 100644 index 0000000000000000000000000000000000000000..415321ab038c302d6543da263fb4fe97e7e81ccb Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.131 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.133 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.133 new file mode 100644 index 0000000000000000000000000000000000000000..7b5e8533a3c3221fd11c7207198e9af5817e7ac9 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.133 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.135 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.135 new file mode 100644 index 0000000000000000000000000000000000000000..674687c5c25af4d7c6e72bd61564162e38ad1396 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.135 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.137 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.137 new file mode 100644 index 0000000000000000000000000000000000000000..2327458686d1cf705ca2a69e71ee88ca36c7adfc Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.137 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.139 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.139 new file mode 100644 index 0000000000000000000000000000000000000000..ab4815c822c2e15b578aac38692883fa2cbfb8ce Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.139 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.141 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.141 new file mode 100644 index 0000000000000000000000000000000000000000..6c807ad2385f1c6bbb3501ad2c77a192a407ab3b Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.141 differ diff --git 
a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.143 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.143 new file mode 100644 index 0000000000000000000000000000000000000000..fa18512fe0c51f0b681b989b55d445a707f4b5bb Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.143 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.145 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.145 new file mode 100644 index 0000000000000000000000000000000000000000..8238d64e394ba911753873abae1df77f56a616cb Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.145 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.147 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.147 new file mode 100644 index 0000000000000000000000000000000000000000..37456d41aea53cb442e5d79256a43d4a5ad9838e Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.147 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.149 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.149 new file mode 100644 index 0000000000000000000000000000000000000000..82b6ee656069abcd0d52bbfdd656adc06403a3fd Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.149 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.151 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.151 new file mode 100644 index 0000000000000000000000000000000000000000..c3e4b5b97402f131274fd33cdbb189043f103592 Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.151 differ diff --git a/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.153 b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.153 new file mode 100644 index 0000000000000000000000000000000000000000..afa4a24c230ba9872dd50dbc2ffc6126c31e3cff Binary files /dev/null and b/runs/May25_10-00-29_Beaver/events.out.tfevents.1621929629.Beaver.142246.153 differ diff --git a/runs/May25_10-00-30_Beaver/1621929630.0479634/events.out.tfevents.1621929630.Beaver.142246.156 b/runs/May25_10-00-30_Beaver/1621929630.0479634/events.out.tfevents.1621929630.Beaver.142246.156 new file mode 100644 index 0000000000000000000000000000000000000000..ffedef3eb5248c40422533ff7700bef4d3db2bd1 Binary files /dev/null and b/runs/May25_10-00-30_Beaver/1621929630.0479634/events.out.tfevents.1621929630.Beaver.142246.156 differ diff --git a/runs/May25_10-00-30_Beaver/1621929630.0811975/events.out.tfevents.1621929630.Beaver.142246.158 b/runs/May25_10-00-30_Beaver/1621929630.0811975/events.out.tfevents.1621929630.Beaver.142246.158 new file mode 100644 index 0000000000000000000000000000000000000000..d8008b1ab8d08793960e98e3708c21f0f673cce0 Binary files /dev/null and b/runs/May25_10-00-30_Beaver/1621929630.0811975/events.out.tfevents.1621929630.Beaver.142246.158 differ diff --git a/runs/May25_10-00-30_Beaver/1621929630.1143088/events.out.tfevents.1621929630.Beaver.142246.160 b/runs/May25_10-00-30_Beaver/1621929630.1143088/events.out.tfevents.1621929630.Beaver.142246.160 new file mode 100644 index 0000000000000000000000000000000000000000..ee5f6076bdce36afe3b60173957d5a54ae5eace3 Binary files /dev/null 
and b/runs/May25_10-00-30_Beaver/1621929630.1143088/events.out.tfevents.1621929630.Beaver.142246.160 differ
[… roughly 180 further binary new-file stanzas, each of the same form — "diff --git a/<path> b/<path>", "new file mode 100644", "index 0000000000000000000000000000000000000000..<blob hash>", "Binary files /dev/null and b/<path> differ" — adding TensorBoard event logs (events.out.tfevents.* files and their per-run timestamp subdirectories) under runs/May25_10-00-30_Beaver/ and runs/May25_10-00-31_Beaver/ (host Beaver, PID 142246, suffixes .155–.211) and under runs/May25_10-00-51_Beaver/ through runs/May25_10-00-57_Beaver/ (host Beaver, PID 142785, suffixes .0–.134) …]
diff --git
a/runs/May25_10-00-57_Beaver/1621929657.3396373/events.out.tfevents.1621929657.Beaver.142785.136 b/runs/May25_10-00-57_Beaver/1621929657.3396373/events.out.tfevents.1621929657.Beaver.142785.136 new file mode 100644 index 0000000000000000000000000000000000000000..8db0699cc988ed2df7a2f2dfe81771089d713847 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.3396373/events.out.tfevents.1621929657.Beaver.142785.136 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.3714027/events.out.tfevents.1621929657.Beaver.142785.138 b/runs/May25_10-00-57_Beaver/1621929657.3714027/events.out.tfevents.1621929657.Beaver.142785.138 new file mode 100644 index 0000000000000000000000000000000000000000..fe6a3c46cf879d34af355412508f2149dd14b8f6 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.3714027/events.out.tfevents.1621929657.Beaver.142785.138 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.4159045/events.out.tfevents.1621929657.Beaver.142785.140 b/runs/May25_10-00-57_Beaver/1621929657.4159045/events.out.tfevents.1621929657.Beaver.142785.140 new file mode 100644 index 0000000000000000000000000000000000000000..a9e48e86ddef9bebfdd1ebba59e49f5404f7f2f7 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.4159045/events.out.tfevents.1621929657.Beaver.142785.140 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.449367/events.out.tfevents.1621929657.Beaver.142785.142 b/runs/May25_10-00-57_Beaver/1621929657.449367/events.out.tfevents.1621929657.Beaver.142785.142 new file mode 100644 index 0000000000000000000000000000000000000000..d162fac04199f3610d87662e94267edf254da6e0 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.449367/events.out.tfevents.1621929657.Beaver.142785.142 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.4801264/events.out.tfevents.1621929657.Beaver.142785.144 b/runs/May25_10-00-57_Beaver/1621929657.4801264/events.out.tfevents.1621929657.Beaver.142785.144 new file mode 100644 index 0000000000000000000000000000000000000000..03df19a614977b6183c915eb4ec4205f519980cb Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.4801264/events.out.tfevents.1621929657.Beaver.142785.144 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.5108728/events.out.tfevents.1621929657.Beaver.142785.146 b/runs/May25_10-00-57_Beaver/1621929657.5108728/events.out.tfevents.1621929657.Beaver.142785.146 new file mode 100644 index 0000000000000000000000000000000000000000..3744138abbe3a9cad33eda6234701f5c591afa74 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.5108728/events.out.tfevents.1621929657.Beaver.142785.146 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.5423715/events.out.tfevents.1621929657.Beaver.142785.148 b/runs/May25_10-00-57_Beaver/1621929657.5423715/events.out.tfevents.1621929657.Beaver.142785.148 new file mode 100644 index 0000000000000000000000000000000000000000..e47bd494339651be09e7e5223527d5b1895a3259 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.5423715/events.out.tfevents.1621929657.Beaver.142785.148 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.5736563/events.out.tfevents.1621929657.Beaver.142785.150 b/runs/May25_10-00-57_Beaver/1621929657.5736563/events.out.tfevents.1621929657.Beaver.142785.150 new file mode 100644 index 0000000000000000000000000000000000000000..dec1cae8eba4701efba3934d708f55b2177ecdae Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.5736563/events.out.tfevents.1621929657.Beaver.142785.150 differ 
diff --git a/runs/May25_10-00-57_Beaver/1621929657.6037364/events.out.tfevents.1621929657.Beaver.142785.152 b/runs/May25_10-00-57_Beaver/1621929657.6037364/events.out.tfevents.1621929657.Beaver.142785.152 new file mode 100644 index 0000000000000000000000000000000000000000..ab36a0b0b4870e34de0e18b7f0c2e497bb6ad0b8 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.6037364/events.out.tfevents.1621929657.Beaver.142785.152 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.7471054/events.out.tfevents.1621929657.Beaver.142785.154 b/runs/May25_10-00-57_Beaver/1621929657.7471054/events.out.tfevents.1621929657.Beaver.142785.154 new file mode 100644 index 0000000000000000000000000000000000000000..fcc76054cc299c046662bfe1abe41db034c3961b Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.7471054/events.out.tfevents.1621929657.Beaver.142785.154 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.8854764/events.out.tfevents.1621929657.Beaver.142785.156 b/runs/May25_10-00-57_Beaver/1621929657.8854764/events.out.tfevents.1621929657.Beaver.142785.156 new file mode 100644 index 0000000000000000000000000000000000000000..93be913a31ce3dbf252672d33943f6110019bb6f Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.8854764/events.out.tfevents.1621929657.Beaver.142785.156 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.918847/events.out.tfevents.1621929657.Beaver.142785.158 b/runs/May25_10-00-57_Beaver/1621929657.918847/events.out.tfevents.1621929657.Beaver.142785.158 new file mode 100644 index 0000000000000000000000000000000000000000..b40797f45437ab8080160d769febaa6f64a4bb86 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.918847/events.out.tfevents.1621929657.Beaver.142785.158 differ diff --git a/runs/May25_10-00-57_Beaver/1621929657.9529428/events.out.tfevents.1621929657.Beaver.142785.160 b/runs/May25_10-00-57_Beaver/1621929657.9529428/events.out.tfevents.1621929657.Beaver.142785.160 new file mode 100644 index 0000000000000000000000000000000000000000..ff7a719da8065cb5613156c35f8058c0efb7fd34 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/1621929657.9529428/events.out.tfevents.1621929657.Beaver.142785.160 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.113 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.113 new file mode 100644 index 0000000000000000000000000000000000000000..e024fe37fafb4150e04e9732cd3d238d8b2bbfea Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.113 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.115 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.115 new file mode 100644 index 0000000000000000000000000000000000000000..0893ee22a8f5b694f47e050480a21860ee864c69 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.115 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.117 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.117 new file mode 100644 index 0000000000000000000000000000000000000000..bb506e7ff697cb50ec4753c027a9e614b0cebd75 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.117 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.119 
b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.119 new file mode 100644 index 0000000000000000000000000000000000000000..8b07ebc0c513fbb19160cef22b2635a51a33e113 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.119 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.121 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.121 new file mode 100644 index 0000000000000000000000000000000000000000..b1c75ca0ecddbcaa83da360ad35acf1dcce7467c Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.121 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.123 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.123 new file mode 100644 index 0000000000000000000000000000000000000000..5ac15805a7c4b2abd182899f279ce64d150f5322 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.123 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.125 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.125 new file mode 100644 index 0000000000000000000000000000000000000000..fa6bfe885c79fe552273d7b430d3b7ba3fdfc604 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.125 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.127 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.127 new file mode 100644 index 0000000000000000000000000000000000000000..3fd3007077fd58979e7970dc041e9578b16afc30 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.127 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.129 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.129 new file mode 100644 index 0000000000000000000000000000000000000000..358bf66ef0104a1aa6990435aa2d42b38f71b66c Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.129 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.131 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.131 new file mode 100644 index 0000000000000000000000000000000000000000..ce3115e1d93fedf57bfa2876dd42e20d0d79dbd7 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.131 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.133 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.133 new file mode 100644 index 0000000000000000000000000000000000000000..dc556b0fa23d5c7ea758c47f619eb87afefbdc8b Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.133 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.135 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.135 new file mode 100644 index 0000000000000000000000000000000000000000..8cc40328fa4f9c22f1a0aec3579aafd4678c60b6 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.135 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.137 
b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.137 new file mode 100644 index 0000000000000000000000000000000000000000..00f44d396be97bb2d5a48d45e17d1e8491d9a3be Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.137 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.139 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.139 new file mode 100644 index 0000000000000000000000000000000000000000..a0524498fd755da63a92d6acc6a402c063ccb11b Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.139 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.141 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.141 new file mode 100644 index 0000000000000000000000000000000000000000..c66a588b28d0d704e62a43ff9c4bb427af984850 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.141 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.143 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.143 new file mode 100644 index 0000000000000000000000000000000000000000..5cf1b7d0405f3139f33237ddd3d904b6cdd92abb Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.143 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.145 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.145 new file mode 100644 index 0000000000000000000000000000000000000000..4446bc2eaf890a3299c29207091355160b9ecec3 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.145 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.147 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.147 new file mode 100644 index 0000000000000000000000000000000000000000..a93988e6be894e81bcc46d6443ee9760fb1c9148 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.147 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.149 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.149 new file mode 100644 index 0000000000000000000000000000000000000000..a8fa2b89b6edc1cdd953ee7f984acbb387e9f98a Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.149 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.151 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.151 new file mode 100644 index 0000000000000000000000000000000000000000..edf4944fb0f8c4da3c727ea4b1853a04963d4a81 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.151 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.153 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.153 new file mode 100644 index 0000000000000000000000000000000000000000..4f2d8b8799cc9ad0ad042de530024d7fbb6a66d0 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.153 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.155 
b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.155 new file mode 100644 index 0000000000000000000000000000000000000000..8410a5f0a1098115ed31986c2d0666daae8bdf1b Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.155 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.157 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.157 new file mode 100644 index 0000000000000000000000000000000000000000..acc852511c136dd37743803733a25d140f04f930 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.157 differ diff --git a/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.159 b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.159 new file mode 100644 index 0000000000000000000000000000000000000000..739c4c83bade5ce9b6381f6f4f02e00793eefbe0 Binary files /dev/null and b/runs/May25_10-00-57_Beaver/events.out.tfevents.1621929657.Beaver.142785.159 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.0503018/events.out.tfevents.1621929658.Beaver.142785.162 b/runs/May25_10-00-58_Beaver/1621929658.0503018/events.out.tfevents.1621929658.Beaver.142785.162 new file mode 100644 index 0000000000000000000000000000000000000000..72c09915e54e42b753dd3ca30a3123135100bc91 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.0503018/events.out.tfevents.1621929658.Beaver.142785.162 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.145187/events.out.tfevents.1621929658.Beaver.142785.164 b/runs/May25_10-00-58_Beaver/1621929658.145187/events.out.tfevents.1621929658.Beaver.142785.164 new file mode 100644 index 0000000000000000000000000000000000000000..8dc06da214953aba9b2e779a9cd4f027fa4d9363 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.145187/events.out.tfevents.1621929658.Beaver.142785.164 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.1766331/events.out.tfevents.1621929658.Beaver.142785.166 b/runs/May25_10-00-58_Beaver/1621929658.1766331/events.out.tfevents.1621929658.Beaver.142785.166 new file mode 100644 index 0000000000000000000000000000000000000000..08d4ee9b368767f0cf534bb9497a706a9511946f Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.1766331/events.out.tfevents.1621929658.Beaver.142785.166 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.2087781/events.out.tfevents.1621929658.Beaver.142785.168 b/runs/May25_10-00-58_Beaver/1621929658.2087781/events.out.tfevents.1621929658.Beaver.142785.168 new file mode 100644 index 0000000000000000000000000000000000000000..41a74f4e297ae739459ccca383c9d5009b5bde2b Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.2087781/events.out.tfevents.1621929658.Beaver.142785.168 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.286852/events.out.tfevents.1621929658.Beaver.142785.170 b/runs/May25_10-00-58_Beaver/1621929658.286852/events.out.tfevents.1621929658.Beaver.142785.170 new file mode 100644 index 0000000000000000000000000000000000000000..667650db1cd07f3aa0b5a74f8d2036d956937faa Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.286852/events.out.tfevents.1621929658.Beaver.142785.170 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.3468313/events.out.tfevents.1621929658.Beaver.142785.172 b/runs/May25_10-00-58_Beaver/1621929658.3468313/events.out.tfevents.1621929658.Beaver.142785.172 new file mode 100644 index 
0000000000000000000000000000000000000000..08c0b9013cfea27c1fe3eac88ad5f079dcfc6fd7 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.3468313/events.out.tfevents.1621929658.Beaver.142785.172 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.3797822/events.out.tfevents.1621929658.Beaver.142785.174 b/runs/May25_10-00-58_Beaver/1621929658.3797822/events.out.tfevents.1621929658.Beaver.142785.174 new file mode 100644 index 0000000000000000000000000000000000000000..9d385310dcde6ff741df225a64bf571adb5a88f3 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.3797822/events.out.tfevents.1621929658.Beaver.142785.174 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.4120576/events.out.tfevents.1621929658.Beaver.142785.176 b/runs/May25_10-00-58_Beaver/1621929658.4120576/events.out.tfevents.1621929658.Beaver.142785.176 new file mode 100644 index 0000000000000000000000000000000000000000..0e12801d8856d6a0fa0af49d4f6be0af9f2fdadf Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.4120576/events.out.tfevents.1621929658.Beaver.142785.176 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.4520016/events.out.tfevents.1621929658.Beaver.142785.178 b/runs/May25_10-00-58_Beaver/1621929658.4520016/events.out.tfevents.1621929658.Beaver.142785.178 new file mode 100644 index 0000000000000000000000000000000000000000..39406227421b58520adf816a1863209b5a79b77e Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.4520016/events.out.tfevents.1621929658.Beaver.142785.178 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.4944272/events.out.tfevents.1621929658.Beaver.142785.180 b/runs/May25_10-00-58_Beaver/1621929658.4944272/events.out.tfevents.1621929658.Beaver.142785.180 new file mode 100644 index 0000000000000000000000000000000000000000..992928f55e002efd969a199395e6181f42cc79d0 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.4944272/events.out.tfevents.1621929658.Beaver.142785.180 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.524788/events.out.tfevents.1621929658.Beaver.142785.182 b/runs/May25_10-00-58_Beaver/1621929658.524788/events.out.tfevents.1621929658.Beaver.142785.182 new file mode 100644 index 0000000000000000000000000000000000000000..61fe9d342c69f40a50f20ee56d4dc433ed14a3e6 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.524788/events.out.tfevents.1621929658.Beaver.142785.182 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.561759/events.out.tfevents.1621929658.Beaver.142785.184 b/runs/May25_10-00-58_Beaver/1621929658.561759/events.out.tfevents.1621929658.Beaver.142785.184 new file mode 100644 index 0000000000000000000000000000000000000000..573c348afddeb6f5cf7b6428d5abf46332f3ef2b Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.561759/events.out.tfevents.1621929658.Beaver.142785.184 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.6087508/events.out.tfevents.1621929658.Beaver.142785.186 b/runs/May25_10-00-58_Beaver/1621929658.6087508/events.out.tfevents.1621929658.Beaver.142785.186 new file mode 100644 index 0000000000000000000000000000000000000000..aed1886493dd684cb48bab13fe59541bafb009c2 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.6087508/events.out.tfevents.1621929658.Beaver.142785.186 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.6411138/events.out.tfevents.1621929658.Beaver.142785.188 b/runs/May25_10-00-58_Beaver/1621929658.6411138/events.out.tfevents.1621929658.Beaver.142785.188 new file mode 
100644 index 0000000000000000000000000000000000000000..42c0a68f6f1c7ff4adf71a537c0dc359579a47a5 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.6411138/events.out.tfevents.1621929658.Beaver.142785.188 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.6782937/events.out.tfevents.1621929658.Beaver.142785.190 b/runs/May25_10-00-58_Beaver/1621929658.6782937/events.out.tfevents.1621929658.Beaver.142785.190 new file mode 100644 index 0000000000000000000000000000000000000000..69c6c4d9997b724bc3938555e90a9f0dd6bd14d1 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.6782937/events.out.tfevents.1621929658.Beaver.142785.190 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.85667/events.out.tfevents.1621929658.Beaver.142785.192 b/runs/May25_10-00-58_Beaver/1621929658.85667/events.out.tfevents.1621929658.Beaver.142785.192 new file mode 100644 index 0000000000000000000000000000000000000000..cb0dff7d6458b15870c0d4b8d3749d1f2af5764e Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.85667/events.out.tfevents.1621929658.Beaver.142785.192 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.890611/events.out.tfevents.1621929658.Beaver.142785.194 b/runs/May25_10-00-58_Beaver/1621929658.890611/events.out.tfevents.1621929658.Beaver.142785.194 new file mode 100644 index 0000000000000000000000000000000000000000..c895f02228c0a429c6e2c3e8c9d46a5b22199ef3 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.890611/events.out.tfevents.1621929658.Beaver.142785.194 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.92037/events.out.tfevents.1621929658.Beaver.142785.196 b/runs/May25_10-00-58_Beaver/1621929658.92037/events.out.tfevents.1621929658.Beaver.142785.196 new file mode 100644 index 0000000000000000000000000000000000000000..9079caa185b96932a26f1ddce496d2fb1133e625 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.92037/events.out.tfevents.1621929658.Beaver.142785.196 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.9512928/events.out.tfevents.1621929658.Beaver.142785.198 b/runs/May25_10-00-58_Beaver/1621929658.9512928/events.out.tfevents.1621929658.Beaver.142785.198 new file mode 100644 index 0000000000000000000000000000000000000000..045a169268ddcd98571c80df409bc22e91db3421 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.9512928/events.out.tfevents.1621929658.Beaver.142785.198 differ diff --git a/runs/May25_10-00-58_Beaver/1621929658.993777/events.out.tfevents.1621929658.Beaver.142785.201 b/runs/May25_10-00-58_Beaver/1621929658.993777/events.out.tfevents.1621929658.Beaver.142785.201 new file mode 100644 index 0000000000000000000000000000000000000000..2ad850985fa7dbf630af8a3f5981237544b15f9b Binary files /dev/null and b/runs/May25_10-00-58_Beaver/1621929658.993777/events.out.tfevents.1621929658.Beaver.142785.201 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.161 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.161 new file mode 100644 index 0000000000000000000000000000000000000000..362b0ce4441394ed841e8122fd9031057bdc0446 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.161 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.163 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.163 new file mode 100644 index 0000000000000000000000000000000000000000..a92e90dbae7745c40f17048d81014d4527f2ee2a 
Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.163 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.165 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.165 new file mode 100644 index 0000000000000000000000000000000000000000..858b03d9ddfdd17a115be5aa13779d956786027d Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.165 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.167 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.167 new file mode 100644 index 0000000000000000000000000000000000000000..70316801885155b72f843cfd647cf00b4c41bbfc Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.167 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.169 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.169 new file mode 100644 index 0000000000000000000000000000000000000000..4884f4772addeca59a4259f7faec8055d1a14887 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.169 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.171 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.171 new file mode 100644 index 0000000000000000000000000000000000000000..046ca48036ad7072d546457897edf60b4b9859b4 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.171 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.173 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.173 new file mode 100644 index 0000000000000000000000000000000000000000..07154518aea37d3ff0aea9a7c5a0dcd33086ed75 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.173 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.175 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.175 new file mode 100644 index 0000000000000000000000000000000000000000..eb32558ce81555b56cd55c81059eb9a2505d7c86 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.175 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.177 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.177 new file mode 100644 index 0000000000000000000000000000000000000000..7079da0f8e650ff39734805574510b750281f781 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.177 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.179 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.179 new file mode 100644 index 0000000000000000000000000000000000000000..31c95177925f974cdf163086b629878ee0592a4e Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.179 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.181 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.181 new file mode 100644 index 0000000000000000000000000000000000000000..bdd42408766fadb5990a2c35423477c3187c6517 Binary files /dev/null and 
b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.181 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.183 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.183 new file mode 100644 index 0000000000000000000000000000000000000000..d54dc9dc2955b8c46b99faf535db22684d6f594b Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.183 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.185 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.185 new file mode 100644 index 0000000000000000000000000000000000000000..8462fc8728a82b471c1852cd86bf464c45765847 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.185 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.187 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.187 new file mode 100644 index 0000000000000000000000000000000000000000..0b4967efd6e20413f968f6be0d7f25243d3bc9b1 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.187 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.189 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.189 new file mode 100644 index 0000000000000000000000000000000000000000..96f7cb2edece935caabd610f56e7cf129a301900 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.189 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.191 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.191 new file mode 100644 index 0000000000000000000000000000000000000000..77bdef7ad3655308112f52887a10ae84d7f7360f Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.191 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.193 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.193 new file mode 100644 index 0000000000000000000000000000000000000000..74b76c51b1cf1b6c46130c2b905d5d08ec168a0b Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.193 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.195 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.195 new file mode 100644 index 0000000000000000000000000000000000000000..221b33f37ca89f0e6396de64c50dcdbdb6230cd7 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.195 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.197 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.197 new file mode 100644 index 0000000000000000000000000000000000000000..e7c82ca15d9cffdce3257de0d408d5ea5a2abe35 Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.197 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.199 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.199 new file mode 100644 index 0000000000000000000000000000000000000000..5e4ba8835110377f81c46139ac2861f8671eb467 Binary files /dev/null and 
b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.199 differ diff --git a/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.200 b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.200 new file mode 100644 index 0000000000000000000000000000000000000000..a8f6a94d43d98121de97f84f1d4b5592a9c6eb9c Binary files /dev/null and b/runs/May25_10-00-58_Beaver/events.out.tfevents.1621929658.Beaver.142785.200 differ diff --git a/runs/May25_10-00-59_Beaver/1621929659.0242562/events.out.tfevents.1621929659.Beaver.142785.203 b/runs/May25_10-00-59_Beaver/1621929659.0242562/events.out.tfevents.1621929659.Beaver.142785.203 new file mode 100644 index 0000000000000000000000000000000000000000..224ac8ecdbbe0d3dbf89feaf4d4d10ae735842bc Binary files /dev/null and b/runs/May25_10-00-59_Beaver/1621929659.0242562/events.out.tfevents.1621929659.Beaver.142785.203 differ diff --git a/runs/May25_10-00-59_Beaver/1621929659.0552833/events.out.tfevents.1621929659.Beaver.142785.205 b/runs/May25_10-00-59_Beaver/1621929659.0552833/events.out.tfevents.1621929659.Beaver.142785.205 new file mode 100644 index 0000000000000000000000000000000000000000..3e8b515a23600f3ac0831e1ed9885a8f7d774c00 Binary files /dev/null and b/runs/May25_10-00-59_Beaver/1621929659.0552833/events.out.tfevents.1621929659.Beaver.142785.205 differ diff --git a/runs/May25_10-00-59_Beaver/1621929659.0882845/events.out.tfevents.1621929659.Beaver.142785.207 b/runs/May25_10-00-59_Beaver/1621929659.0882845/events.out.tfevents.1621929659.Beaver.142785.207 new file mode 100644 index 0000000000000000000000000000000000000000..68da99ee4161565a97b28aaf3e499bf6a62e749f Binary files /dev/null and b/runs/May25_10-00-59_Beaver/1621929659.0882845/events.out.tfevents.1621929659.Beaver.142785.207 differ diff --git a/runs/May25_10-00-59_Beaver/1621929659.1174583/events.out.tfevents.1621929659.Beaver.142785.209 b/runs/May25_10-00-59_Beaver/1621929659.1174583/events.out.tfevents.1621929659.Beaver.142785.209 new file mode 100644 index 0000000000000000000000000000000000000000..5a17ab6334c215f8228fa241dfd44c6b5c1a7b28 Binary files /dev/null and b/runs/May25_10-00-59_Beaver/1621929659.1174583/events.out.tfevents.1621929659.Beaver.142785.209 differ diff --git a/runs/May25_10-00-59_Beaver/1621929659.1484091/events.out.tfevents.1621929659.Beaver.142785.211 b/runs/May25_10-00-59_Beaver/1621929659.1484091/events.out.tfevents.1621929659.Beaver.142785.211 new file mode 100644 index 0000000000000000000000000000000000000000..617b5a8247dcc87c940fd5adb34e83d91a7db7c3 Binary files /dev/null and b/runs/May25_10-00-59_Beaver/1621929659.1484091/events.out.tfevents.1621929659.Beaver.142785.211 differ diff --git a/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.202 b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.202 new file mode 100644 index 0000000000000000000000000000000000000000..592216c3275759a46b42f2c23f3548698672d5f9 Binary files /dev/null and b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.202 differ diff --git a/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.204 b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.204 new file mode 100644 index 0000000000000000000000000000000000000000..72a87cb2586fef35486153cbf0cc4777e440ea1a Binary files /dev/null and b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.204 differ diff --git 
a/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.206 b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.206
new file mode 100644
index 0000000000000000000000000000000000000000..7adb4abb505a83d996de2fd287f8037ae8e9881c
Binary files /dev/null and b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.206 differ
diff --git a/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.208 b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.208
new file mode 100644
index 0000000000000000000000000000000000000000..d1e704bde509674a24d2ec77958c302052051f89
Binary files /dev/null and b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.208 differ
diff --git a/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.210 b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.210
new file mode 100644
index 0000000000000000000000000000000000000000..f0bbefcbfffbece39c0c9dca57762b42b00fedba
Binary files /dev/null and b/runs/May25_10-00-59_Beaver/events.out.tfevents.1621929659.Beaver.142785.210 differ
diff --git a/sagemaker/README.md b/sagemaker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6675c190b31acb63f8ca9827eb323ca0f298d82
--- /dev/null
+++ b/sagemaker/README.md
@@ -0,0 +1,148 @@
+# Testing the new Hugging Face Deep Learning Container
+
+This document explains the testing strategy for releasing the new Hugging Face Deep Learning Container. AWS maintains 14 days of currency with framework releases. Besides framework releases, the AWS release train is bi-weekly, on Mondays. The code cutoff for any changes is the Wednesday before the release Monday.
+
+
+## Test Case 1: Releasing a New Version (Minor/Major) of 🤗 Transformers
+
+### Requirements: Tests should run on the Release Candidate of the new `transformers` release to validate that the new release is compatible with the DLCs. To run these tests you need credentials for the HF SageMaker AWS account. You can ask @philschmid or @n1t0 to get access.
+
+### Run Tests:
+
+Before we can run the tests we need to adjust the `requirements.txt` for PyTorch under `/tests/sagemaker/scripts/pytorch` and for TensorFlow under `/tests/sagemaker/scripts/tensorflow`. We adjust the branch to the new RC tag.
+
+```
+git+https://github.com/huggingface/transformers.git@v4.5.0.rc0 # install master or adjust it to vX.X.X to install a specific transformers version
+```
+
+After we have adjusted the `requirements.txt` we can run the Amazon SageMaker tests with:
+
+```bash
+AWS_PROFILE= make test-sagemaker
+```
+These tests take around 10-15 minutes to finish. Preferably take a screenshot of the successfully run tests.
+
+### After Transformers Release:
+
+After we have released the Release Candidate we need to create a PR at the [Deep Learning Container Repository](https://github.com/aws/deep-learning-containers).
+
+**Creating the update PR:**
+
+1. Update the two latest `buildspec.yaml` configs for [PyTorch](https://github.com/aws/deep-learning-containers/tree/master/huggingface/pytorch) and [TensorFlow](https://github.com/aws/deep-learning-containers/tree/master/huggingface/tensorflow). The two latest `buildspec.yaml` files are the one without a version tag and the one with the highest framework version, e.g. `buildspec-1-7-1.yml` and not `buildspec-1-6.yml`.
+
+To update the `buildspec.yaml` we need to adjust either the `transformers_version`, the `datasets_version`, or both.
+Example for upgrading to `transformers 4.5.0` and `datasets 1.6.0`:
+```yaml
+account_id: &ACCOUNT_ID
+region: &REGION
+base_framework: &BASE_FRAMEWORK pytorch
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION 1.6.0
+short_version: &SHORT_VERSION 1.6
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+
+images:
+  BuildHuggingFacePytorchGpuPy37Cu110TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 15000
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py36
+    cuda_version: &CUDA_VERSION cu110
+    os_version: &OS_VERSION ubuntu18.04
+    transformers_version: &TRANSFORMERS_VERSION 4.5.0 # this was adjusted from 4.4.2 to 4.5.0
+    datasets_version: &DATASETS_VERSION 1.6.0 # this was adjusted from 1.5.0 to 1.6.0
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+```
+2. In the PR description, describe which tests we ran and with which package versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016) to see which information is needed.
+
+## Test Case 2: Releasing a New AWS Framework DLC
+
+### Requirements: AWS is going to release new DLCs for PyTorch and/or TensorFlow. The tests should run on the new framework versions with the current `transformers` release to validate that the new framework release is compatible with the `transformers` version. To run these tests you need credentials for the HF SageMaker AWS account. You can ask @philschmid or @n1t0 to get access. AWS will notify us with a new issue in the repository pointing to their framework upgrade PR.
+
+### Run Tests:
+
+Before we can run the tests we need to adjust the `requirements.txt` for PyTorch under `/tests/sagemaker/scripts/pytorch` and for TensorFlow under `/tests/sagemaker/scripts/tensorflow`. We add the new framework version to it.
+
+```
+torch==1.8.1 # for pytorch
+tensorflow-gpu==2.5.0 # for tensorflow
+```
+
+After we have adjusted the `requirements.txt` we can run the Amazon SageMaker tests with:
+
+```bash
+AWS_PROFILE= make test-sagemaker
+```
+These tests take around 10-15 minutes to finish. Preferably take a screenshot of the successfully run tests.
+
+### After successful Tests:
+
+After we have successfully run the tests for the new framework version we need to create a PR at the [Deep Learning Container Repository](https://github.com/aws/deep-learning-containers).
+
+**Creating the update PR:**
+
+1. Create a new `buildspec.yaml` config for [PyTorch](https://github.com/aws/deep-learning-containers/tree/master/huggingface/pytorch) and [TensorFlow](https://github.com/aws/deep-learning-containers/tree/master/huggingface/tensorflow) and rename the old `buildspec.yaml` to `buildspec-x-x-x.yml`, where `x-x-x` is the base framework version, e.g. if PyTorch 1.6.0 is the latest version in `buildspec.yaml`, the file should be renamed to `buildspec-1-6.yml`.
+
+To create the new `buildspec.yaml` we need to adjust the `version` and the `short_version`. Example for upgrading to `pytorch 1.7.1`:
+
+```yaml
+account_id: &ACCOUNT_ID
+region: &REGION
+base_framework: &BASE_FRAMEWORK pytorch
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION 1.7.1 # this was adjusted from 1.6.0 to 1.7.1
+short_version: &SHORT_VERSION 1.7 # this was adjusted from 1.6 to 1.7
+
+repository_info:
+  training_repository: &TRAINING_REPOSITORY
+    image_type: &TRAINING_IMAGE_TYPE training
+    root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
+    repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+
+images:
+  BuildHuggingFacePytorchGpuPy37Cu110TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 15000
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py36
+    cuda_version: &CUDA_VERSION cu110
+    os_version: &OS_VERSION ubuntu18.04
+    transformers_version: &TRANSFORMERS_VERSION 4.4.2
+    datasets_version: &DATASETS_VERSION 1.5.0
+    tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+```
+2. In the PR description, describe which tests we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1025) to see which information is needed.
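+
+For reference, `!join` is a custom YAML tag used by the AWS build tooling that concatenates its list items into a single string. The following is an illustrative Python sketch (not part of the build system) of how the `tag` field above is assembled from the anchor values:
+
+```python
+# Anchor values taken from the pytorch 1.7.1 example above.
+version = "1.7.1"
+transformers_version = "4.4.2"
+device_type = "gpu"
+tag_python_version = "py36"
+cuda_version = "cu110"
+os_version = "ubuntu18.04"
+
+# tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, ... ]
+tag = "".join(
+    [version, "-", "transformers", transformers_version, "-", device_type,
+     "-", tag_python_version, "-", cuda_version, "-", os_version]
+)
+print(tag)  # 1.7.1-transformers4.4.2-gpu-py36-cu110-ubuntu18.04
+```
+
+The result is the Docker image tag that ends up in the ECR image URI (compare the `image_uri` values in `conftest.py`).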
+
+## Current Tests
+
+| ID | Description | Platform | #GPUs | Collected & evaluated metrics |
+|-------------------------------------|----------------------------------------------------------------|-----------------------------|-------|------------------------------------------|
+| pytorch-transformers-test-single    | test BERT fine-tuning using the transformers lib + PT          | SageMaker createTrainingJob | 1     | train_runtime, eval_accuracy & eval_loss |
+| pytorch-transformers-test-2-ddp     | test BERT fine-tuning using the transformers lib + PT DDP      | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |
+| pytorch-transformers-test-2-smd     | test BERT fine-tuning using the transformers lib + PT SM DDP   | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |
+| pytorch-transformers-test-1-smp     | test RoBERTa fine-tuning using the transformers lib + PT SM MP | SageMaker createTrainingJob | 8     | train_runtime, eval_accuracy & eval_loss |
+| tensorflow-transformers-test-single | test BERT fine-tuning using the transformers lib + TF          | SageMaker createTrainingJob | 1     | train_runtime, eval_accuracy & eval_loss |
+| tensorflow-transformers-test-2-smd  | test BERT fine-tuning using the transformers lib + TF SM DDP   | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |
diff --git a/sagemaker/__init__.py b/sagemaker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecda04614d4218221a399640932b6ecf4f9b60f2
--- /dev/null
+++ b/sagemaker/__init__.py
@@ -0,0 +1,5 @@
+import importlib.util
+
+
+def is_sagemaker_available():
+    # check for the sagemaker package without importing it
+    return importlib.util.find_spec("sagemaker") is not None
diff --git a/sagemaker/__pycache__/__init__.cpython-38.pyc b/sagemaker/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa71ed282cc6a38fce8837f245b517cae28984da
Binary files /dev/null and b/sagemaker/__pycache__/__init__.cpython-38.pyc differ
diff --git a/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc b/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26839106f534a1241bdaaff730b82c8f202a0b44
Binary files /dev/null and b/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.2.pyc differ
diff --git a/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc b/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..801052c5b541e4bd86ef5c5c1f1add8fab083589
Binary files /dev/null and b/sagemaker/__pycache__/conftest.cpython-38-pytest-6.2.4.pyc differ
diff --git a/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.2.pyc b/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15445849112ffd4e364fbcc8d2c7be7cecaee239
Binary files /dev/null and b/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.2.pyc differ
diff --git a/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.4.pyc b/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce5e9b18ad7feeb606d03e05ab0e5f8ccb5985dd
Binary files /dev/null and b/sagemaker/__pycache__/test_multi_node_data_parallel.cpython-38-pytest-6.2.4.pyc differ
diff --git a/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.2.pyc b/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1085c1a102d1b711f22b298e31a04b781998f9d2
Binary files /dev/null and b/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.2.pyc differ
diff --git a/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.4.pyc b/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c585288ca95f8d371a493894c961d5ddf9d9406
Binary files /dev/null and b/sagemaker/__pycache__/test_multi_node_model_parallel.cpython-38-pytest-6.2.4.pyc differ
diff --git a/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.2.pyc b/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..682b7da5f7b4e950842e06d27aa5433c7bed1ca6
Binary files /dev/null and b/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.2.pyc differ
diff --git a/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.4.pyc b/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.4.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bad27156e47151a0c8e5c32eb4f5e2a47e01881
Binary files /dev/null and b/sagemaker/__pycache__/test_single_node_gpu.cpython-38-pytest-6.2.4.pyc differ
diff --git a/sagemaker/conftest.py b/sagemaker/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..076e06784bc1db8cbee5bcecc1dd46d20a5c5c88
--- /dev/null
+++ b/sagemaker/conftest.py
@@ -0,0 +1,65 @@
+# We define a fixture function below; tests "use" it by referencing its name.
+
+import os
+
+import pytest
+
+from attr import dataclass
+
+
+os.environ["AWS_DEFAULT_REGION"] = "us-east-1"  # default region for the test jobs
+
+
+@dataclass
+class SageMakerTestEnvironment:
+    framework: str
+    role = "arn:aws:iam::558105141721:role/sagemaker_execution_role"
+    hyperparameters = {
+        "task_name": "mnli",
+        "per_device_train_batch_size": 32,
+        "per_device_eval_batch_size": 32,
+        "do_train": True,
+        "do_eval": True,
+        "do_predict": True,
+        "output_dir": "/opt/ml/model",
+        "overwrite_output_dir": True,
+        "max_steps": 500,
+        "save_steps": 5500,
+    }
+    distributed_hyperparameters = {**hyperparameters, "max_steps": 1000}
+
+    @property
+    def metric_definitions(self) -> list:
+        # SageMaker applies these regexes to the training job's log stream to extract metrics.
+        if self.framework == "pytorch":
+            return [
+                {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
+                {"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
+                {"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
+            ]
+        else:
+            # The TF script logs Keras-style metrics; each metric name is paired with the
+            # regex that extracts its own value.
+            return [
+                {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
+                {"Name": "eval_accuracy", "Regex": r"sparse_categorical_accuracy.*=\D*(.*?)]?$"},
+                {"Name": "eval_loss", "Regex": r"loss.*=\D*(.*?)]?$"},
+            ]
+
+    @property
+    def base_job_name(self) -> str:
+        return f"{self.framework}-transformers-test"
+
+    @property
+    def test_path(self) -> str:
+        return f"./tests/sagemaker/scripts/{self.framework}"
+
+    @property
+    def image_uri(self) -> str:
+        if self.framework == "pytorch":
+            return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04"
+        else:
"763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04" + + +@pytest.fixture(scope="class") +def sm_env(request): + request.cls.env = SageMakerTestEnvironment(framework=request.cls.framework) diff --git a/sagemaker/scripts/pytorch/requirements.txt b/sagemaker/scripts/pytorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0194b67c403dedd97a84e1d64b2990d11993906d --- /dev/null +++ b/sagemaker/scripts/pytorch/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/huggingface/transformers.git@master # install master or adjust ist with vX.X.X for installing version specific transforms \ No newline at end of file diff --git a/sagemaker/scripts/pytorch/run_ddp.py b/sagemaker/scripts/pytorch/run_ddp.py new file mode 100644 index 0000000000000000000000000000000000000000..1191caeb96a29fab1bf11610ece865d73bf512a5 --- /dev/null +++ b/sagemaker/scripts/pytorch/run_ddp.py @@ -0,0 +1,52 @@ +import json +import logging +import os +import subprocess +from argparse import ArgumentParser + + +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = ArgumentParser() + parsed, unknown = parser.parse_known_args() + for arg in unknown: + if arg.startswith(("-", "--")): + parser.add_argument(arg.split("=")[0]) + + return parser.parse_args() + + +def main(): + args = parse_args() + port = 8888 + num_gpus = int(os.environ["SM_NUM_GPUS"]) + hosts = json.loads(os.environ["SM_HOSTS"]) + num_nodes = len(hosts) + current_host = os.environ["SM_CURRENT_HOST"] + rank = hosts.index(current_host) + os.environ["NCCL_DEBUG"] = "INFO" + + if num_nodes > 1: + cmd = f"""python -m torch.distributed.launch \ + --nnodes={num_nodes} \ + --node_rank={rank} \ + --nproc_per_node={num_gpus} \ + --master_addr={hosts[0]} \ + --master_port={port} \ + ./run_glue.py \ + {"".join([f" --{parameter} {value}" for parameter,value in args.__dict__.items()])}""" + else: + cmd = f"""python -m torch.distributed.launch \ + --nproc_per_node={num_gpus} \ + ./run_glue.py \ + {"".join([f" --{parameter} {value}" for parameter,value in args.__dict__.items()])}""" + try: + subprocess.run(cmd, shell=True) + except Exception as e: + logger.info(e) + + +if __name__ == "__main__": + main() diff --git a/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/sagemaker/scripts/pytorch/run_glue_model_parallelism.py new file mode 100644 index 0000000000000000000000000000000000000000..2021392930d8fd7657db5a4bc0f5869d06b129e5 --- /dev/null +++ b/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE.""" +# You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
+ +import logging +import os +import random +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from datasets import load_dataset, load_metric + +import transformers +from transformers import ( # Trainer,; TrainingArguments, + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PretrainedConfig, + default_data_collator, + set_seed, +) + +# Will import SageMaker Model parallelism specific Trainer +from transformers.sagemaker import SageMakerTrainer as Trainer +from transformers.sagemaker import SageMakerTrainingArguments as TrainingArguments +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.4.2") + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." + }, + ) + max_test_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of test examples to this " + "value if set." 
+ }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) + + def __post_init__(self): + if self.task_name is not None: + self.task_name = self.task_name.lower() + if self.task_name not in task_to_keys.keys(): + raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) + elif self.train_file is None or self.validation_file is None: + raise ValueError("Need either a GLUE task or a training/validation file.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." 
+ ) + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if training_args.should_log: + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the + # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named + # label if at least two columns are provided. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.task_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset("glue", data_args.task_name) + else: + # Loading a dataset from your local files. + # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + + # Get the test dataset: you can provide your own CSV/JSON test file (see below) + # when you use `do_predict` without specifying a GLUE benchmark task. + if training_args.do_predict: + if data_args.test_file is not None: + train_extension = data_args.train_file.split(".")[-1] + test_extension = data_args.test_file.split(".")[-1] + assert ( + test_extension == train_extension + ), "`test_file` should have the same extension (csv or json) as `train_file`." + data_files["test"] = data_args.test_file + else: + raise ValueError("Need either a GLUE task or a test file for `do_predict`.") + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + datasets = load_dataset("csv", data_files=data_files) + else: + # Loading a dataset from local json files + datasets = load_dataset("json", data_files=data_files) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. 
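+    # At this point `datasets` is a DatasetDict; e.g. for the MNLI task it holds the splits
+    # train, validation_matched, validation_mismatched, test_matched and test_mismatched.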
+ + # Labels + if data_args.task_name is not None: + is_regression = data_args.task_name == "stsb" + if not is_regression: + label_list = datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + else: + # Trying to have good defaults here, don't hesitate to tweak to your needs. + is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] + if is_regression: + num_labels = 1 + else: + # A useful fast method: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique + label_list = datasets["train"].unique("label") + label_list.sort() # Let's sort it for determinism + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # Preprocessing the datasets + if data_args.task_name is not None: + sentence1_key, sentence2_key = task_to_keys[data_args.task_name] + else: + # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. + non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] + if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: + sentence1_key, sentence2_key = "sentence1", "sentence2" + else: + if len(non_label_column_names) >= 2: + sentence1_key, sentence2_key = non_label_column_names[:2] + else: + sentence1_key, sentence2_key = non_label_column_names[0], None + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + label_to_id = None + if ( + model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id + and data_args.task_name is not None + and not is_regression + ): + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
+ "\nIgnoring the model labels as a result.", + ) + elif data_args.task_name is None and not is_regression: + label_to_id = {v: i for i, v in enumerate(label_list)} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in datasets and "validation_matched" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] + if data_args.max_val_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) + + if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: + if "test" not in datasets and "test_matched" not in datasets: + raise ValueError("--do_predict requires a test dataset") + test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] + if data_args.max_test_samples is not None: + test_dataset = test_dataset.select(range(data_args.max_test_samples)) + + # Log a few random samples from the training set: + if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # Get the metric function + if data_args.task_name is not None: + metric = load_metric("glue", data_args.task_name) + # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from + # compute_metrics + + # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. 
+ def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) + if data_args.task_name is not None: + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + elif is_regression: + return {"mse": ((preds - p.label_ids) ** 2).mean().item()} + else: + return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + + # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. + if data_args.pad_to_max_length: + data_collator = default_data_collator + elif training_args.fp16: + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + else: + data_collator = None + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if last_checkpoint is not None: + checkpoint = last_checkpoint + elif os.path.isdir(model_args.model_name_or_path): + # Check the config from that potential checkpoint has the right number of labels before using it as a + # checkpoint. + if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels: + checkpoint = model_args.model_name_or_path + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.save_model() # Saves the tokenizer too for easy upload + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + eval_datasets = [eval_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + eval_datasets.append(datasets["validation_mismatched"]) + + for eval_dataset, task in zip(eval_datasets, tasks): + metrics = trainer.evaluate(eval_dataset=eval_dataset) + + max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.do_predict: + logger.info("*** Test ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + test_datasets = [test_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + test_datasets.append(datasets["test_mismatched"]) + + for test_dataset, task in zip(test_datasets, tasks): + # Removing the `label` columns because it contains -1 and Trainer won't like that. 
+            test_dataset.remove_columns_("label")
+            predictions = trainer.predict(test_dataset=test_dataset).predictions
+            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
+
+            output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt")
+            if trainer.is_world_process_zero():
+                with open(output_test_file, "w") as writer:
+                    logger.info(f"***** Test results {task} *****")
+                    writer.write("index\tprediction\n")
+                    for index, item in enumerate(predictions):
+                        if is_regression:
+                            writer.write(f"{index}\t{item:3.3f}\n")
+                        else:
+                            item = label_list[item]
+                            writer.write(f"{index}\t{item}\n")
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sagemaker/scripts/tensorflow/requirements.txt b/sagemaker/scripts/tensorflow/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0194b67c403dedd97a84e1d64b2990d11993906d
--- /dev/null
+++ b/sagemaker/scripts/tensorflow/requirements.txt
@@ -0,0 +1 @@
+git+https://github.com/huggingface/transformers.git@master # install master, or pin it to vX.X.X to install a specific transformers version
\ No newline at end of file
diff --git a/sagemaker/scripts/tensorflow/run_tf.py b/sagemaker/scripts/tensorflow/run_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..a47e76c09d6125717b034d3ec976c337c08779bd
--- /dev/null
+++ b/sagemaker/scripts/tensorflow/run_tf.py
@@ -0,0 +1,93 @@
+import argparse
+import logging
+import sys
+import time
+
+import tensorflow as tf
+from datasets import load_dataset
+
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # Hyperparameters sent by the client are passed as command-line arguments to the script.
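+    # e.g. hyperparameters={"epochs": 1, "model_name_or_path": "distilbert-base-cased"} on the
+    # HuggingFace estimator reaches this script as `--epochs 1 --model_name_or_path distilbert-base-cased`.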
+ parser.add_argument("--epochs", type=int, default=1) + parser.add_argument("--per_device_train_batch_size", type=int, default=16) + parser.add_argument("--per_device_eval_batch_size", type=int, default=8) + parser.add_argument("--model_name_or_path", type=str) + parser.add_argument("--learning_rate", type=str, default=5e-5) + parser.add_argument("--do_train", type=bool, default=True) + parser.add_argument("--do_eval", type=bool, default=True) + parser.add_argument("--output_dir", type=str) + + args, _ = parser.parse_known_args() + + # overwrite batch size until we have tf_glue.py + args.per_device_train_batch_size = 16 + args.per_device_eval_batch_size = 16 + + # Set up logging + logger = logging.getLogger(__name__) + + logging.basicConfig( + level=logging.getLevelName("INFO"), + handlers=[logging.StreamHandler(sys.stdout)], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # Load model and tokenizer + model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + + # Load dataset + train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) + train_dataset = train_dataset.shuffle().select(range(5000)) # smaller the size for train dataset to 5k + test_dataset = test_dataset.shuffle().select(range(500)) # smaller the size for test dataset to 500 + + # Preprocess train dataset + train_dataset = train_dataset.map( + lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True + ) + train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) + + train_features = { + x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) + for x in ["input_ids", "attention_mask"] + } + tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"])).batch( + args.per_device_train_batch_size + ) + + # Preprocess test dataset + test_dataset = test_dataset.map( + lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True + ) + test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"]) + + test_features = { + x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length]) + for x in ["input_ids", "attention_mask"] + } + tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"])).batch( + args.per_device_eval_batch_size + ) + + # fine optimizer and loss + optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + start_train_time = time.time() + train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.per_device_train_batch_size) + end_train_time = time.time() - start_train_time + + logger.info("*** Train ***") + logger.info(f"train_runtime = {end_train_time}") + for key, value in train_results.history.items(): + logger.info(f" {key} = {value}") diff --git a/sagemaker/scripts/tensorflow/run_tf_dist.py b/sagemaker/scripts/tensorflow/run_tf_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..4ff709d037aad50a208632ceb4e545fb73f447f0 --- /dev/null +++ b/sagemaker/scripts/tensorflow/run_tf_dist.py @@ -0,0 +1,194 @@ +import argparse +import logging +import os +import sys +import time + +import 
+from datasets import load_dataset
+from tqdm import tqdm
+
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+from transformers.file_utils import is_sagemaker_dp_enabled
+
+
+if os.environ.get("SDP_ENABLED") or is_sagemaker_dp_enabled():
+    SDP_ENABLED = True
+    os.environ["SAGEMAKER_INSTANCE_TYPE"] = "p3dn.24xlarge"
+    import smdistributed.dataparallel.tensorflow as sdp
+else:
+    SDP_ENABLED = False
+
+
+def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=None):
+    pbar = tqdm(train_dataset)
+    for i, batch in enumerate(pbar):
+        with tf.GradientTape() as tape:
+            inputs, targets = batch
+            outputs = model(inputs)  # forward pass on the input features only
+            loss_value = loss(targets, outputs.logits)
+
+        if SDP_ENABLED:
+            tape = sdp.DistributedGradientTape(tape, sparse_as_dense=True)
+
+        grads = tape.gradient(loss_value, model.trainable_variables)
+        opt.apply_gradients(zip(grads, model.trainable_variables))
+
+        pbar.set_description(f"Loss: {loss_value:.4f}")
+
+        if SDP_ENABLED and i == 0:
+            sdp.broadcast_variables(model.variables, root_rank=0)
+            sdp.broadcast_variables(opt.variables(), root_rank=0)
+
+        if max_steps and i >= max_steps:
+            break
+
+    train_results = {"loss": loss_value.numpy()}
+    return train_results
+
+
+def get_datasets(tokenizer, train_batch_size, eval_batch_size):
+    # Load dataset
+    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
+
+    # Preprocess train dataset
+    train_dataset = train_dataset.map(
+        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
+    )
+    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
+
+    train_features = {
+        x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
+        for x in ["input_ids", "attention_mask"]
+    }
+    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"]))
+
+    # Preprocess test dataset
+    test_dataset = test_dataset.map(
+        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
+    )
+    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
+
+    test_features = {
+        x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
+        for x in ["input_ids", "attention_mask"]
+    }
+    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"]))
+
+    if SDP_ENABLED:
+        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
+        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())
+    tf_train_dataset = tf_train_dataset.batch(train_batch_size, drop_remainder=True)
+    tf_test_dataset = tf_test_dataset.batch(eval_batch_size, drop_remainder=True)
+
+    return tf_train_dataset, tf_test_dataset
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # Hyperparameters sent by the client are passed as command-line arguments to the script.
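+    # (when the estimator enables the smdistributed dataparallel distribution, SDP_ENABLED is True
+    # above and get_datasets() gives every rank its own shard of the IMDB data)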
+ parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--per_device_train_batch_size", type=int, default=16) + parser.add_argument("--per_device_eval_batch_size", type=int, default=8) + parser.add_argument("--model_name_or_path", type=str) + parser.add_argument("--learning_rate", type=str, default=5e-5) + parser.add_argument("--do_train", type=bool, default=True) + parser.add_argument("--do_eval", type=bool, default=True) + parser.add_argument("--output_dir", type=str) + parser.add_argument("--max_steps", type=int, default=None) + + # Data, model, and output directories + parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"]) + parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"]) + + args, _ = parser.parse_known_args() + + # Set up logging + logger = logging.getLogger(__name__) + + logging.basicConfig( + level=logging.getLevelName("INFO"), + handlers=[logging.StreamHandler(sys.stdout)], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + if SDP_ENABLED: + sdp.init() + + gpus = tf.config.experimental.list_physical_devices("GPU") + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], "GPU") + + # Load model and tokenizer + model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + + # get datasets + tf_train_dataset, tf_test_dataset = get_datasets( + tokenizer=tokenizer, + train_batch_size=args.per_device_train_batch_size, + eval_batch_size=args.per_device_eval_batch_size, + ) + + # fine optimizer and loss + optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [tf.keras.metrics.SparseCategoricalAccuracy()] + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + # Training + if args.do_train: + + # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size) + start_train_time = time.time() + train_results = fit( + model, + loss, + optimizer, + tf_train_dataset, + args.epochs, + args.per_device_train_batch_size, + max_steps=args.max_steps, + ) + end_train_time = time.time() - start_train_time + logger.info("*** Train ***") + logger.info(f"train_runtime = {end_train_time}") + + output_eval_file = os.path.join(args.output_dir, "train_results.txt") + + if not SDP_ENABLED or sdp.rank() == 0: + with open(output_eval_file, "w") as writer: + logger.info("***** Train results *****") + logger.info(train_results) + for key, value in train_results.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Evaluation + if args.do_eval and (not SDP_ENABLED or sdp.rank() == 0): + + result = model.evaluate(tf_test_dataset, batch_size=args.per_device_eval_batch_size, return_dict=True) + logger.info("*** Evaluate ***") + + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + logger.info(result) + for key, value in result.items(): + logger.info(f" {key} = {value}") + writer.write(f"{key} = {value}\n") + + # Save result + if SDP_ENABLED: + if sdp.rank() == 0: + model.save_pretrained(args.output_dir) + tokenizer.save_pretrained(args.output_dir) + 
+    else:
+        model.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
diff --git a/sagemaker/test_multi_node_data_parallel.py b/sagemaker/test_multi_node_data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..0488e4fcf8c51843a3d62fdf711983ce793a98b4
--- /dev/null
+++ b/sagemaker/test_multi_node_data_parallel.py
@@ -0,0 +1,110 @@
+import json
+import os
+import subprocess
+import unittest
+from ast import literal_eval
+
+import pytest
+
+from parameterized import parameterized, parameterized_class
+
+from . import is_sagemaker_available
+
+
+if is_sagemaker_available():
+    from sagemaker import Session, TrainingJobAnalytics
+    from sagemaker.huggingface import HuggingFace
+
+
+@pytest.mark.skipif(
+    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
+    reason="Skipping the test because it should only be run when releasing a minor transformers version",
+)
+@pytest.mark.usefixtures("sm_env")
+@parameterized_class(
+    [
+        {
+            "framework": "pytorch",
+            "script": "run_glue.py",
+            "model_name_or_path": "distilbert-base-cased",
+            "instance_type": "ml.p3dn.24xlarge",
+            "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
+        },
+        {
+            "framework": "pytorch",
+            "script": "run_ddp.py",
+            "model_name_or_path": "distilbert-base-cased",
+            "instance_type": "ml.p3dn.24xlarge",
+            "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
+        },
+        {
+            "framework": "tensorflow",
+            "script": "run_tf_dist.py",
+            "model_name_or_path": "distilbert-base-cased",
+            "instance_type": "ml.p3dn.24xlarge",
+            "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
+        },
+    ]
+)
+class MultiNodeTest(unittest.TestCase):
+    def setUp(self):
+        if self.framework == "pytorch":
+            subprocess.run(
+                f"cp ./examples/pytorch/text-classification/run_glue.py {self.env.test_path}/run_glue.py".split(),
+                encoding="utf-8",
+                check=True,
+            )
+        assert hasattr(self, "env")
+
+    def create_estimator(self, instance_count):
+        job_name = f"{self.env.base_job_name}-{instance_count}-{'ddp' if 'ddp' in self.script else 'smd'}"
+        # distributed data settings
+        distribution = {"smdistributed": {"dataparallel": {"enabled": True}}} if self.script != "run_ddp.py" else None
+
+        # creates estimator
+        return HuggingFace(
+            entry_point=self.script,
+            source_dir=self.env.test_path,
+            role=self.env.role,
+            image_uri=self.env.image_uri,
+            base_job_name=job_name,
+            instance_count=instance_count,
+            instance_type=self.instance_type,
+            debugger_hook_config=False,
+            hyperparameters={**self.env.distributed_hyperparameters, "model_name_or_path": self.model_name_or_path},
+            metric_definitions=self.env.metric_definitions,
+            distribution=distribution,
+            py_version="py36",
+        )
+
+    def save_results_as_csv(self, job_name):
+        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")
+
+    # @parameterized.expand([(2,), (4,),])
+    @parameterized.expand([(2,)])
+    def test_script(self, instance_count):
+        # create estimator
+        estimator = self.create_estimator(instance_count)
+
+        # run training
+        estimator.fit()
+
+        # result dataframe
+        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
+
+        # extract kpis
+        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
+        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
+        # get the train time from the SageMaker job; this includes starting, preprocessing and stopping
+        train_runtime = (
+            Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
+        )
+
+        # assert kpis
+        assert train_runtime <= self.results["train_runtime"]
+        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
+        assert all(t <= self.results["eval_loss"] for t in eval_loss)
+
+        # dump the test results into a json file to share in the PR
+        with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
+            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
diff --git a/sagemaker/test_multi_node_model_parallel.py b/sagemaker/test_multi_node_model_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a1c9a6b3b7bda68c033845759f318980dac35f
--- /dev/null
+++ b/sagemaker/test_multi_node_model_parallel.py
@@ -0,0 +1,124 @@
+import json
+import os
+import subprocess
+import unittest
+from ast import literal_eval
+
+import pytest
+
+from parameterized import parameterized, parameterized_class
+
+from . import is_sagemaker_available
+
+
+if is_sagemaker_available():
+    from sagemaker import Session, TrainingJobAnalytics
+    from sagemaker.huggingface import HuggingFace
+
+
+@pytest.mark.skipif(
+    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
+    reason="Skipping the test because it should only be run when releasing a minor transformers version",
+)
+@pytest.mark.usefixtures("sm_env")
+@parameterized_class(
+    [
+        {
+            "framework": "pytorch",
+            "script": "run_glue_model_parallelism.py",
+            "model_name_or_path": "roberta-large",
+            "instance_type": "ml.p3dn.24xlarge",
+            "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
+        },
+        {
+            "framework": "pytorch",
+            "script": "run_glue.py",
+            "model_name_or_path": "roberta-large",
+            "instance_type": "ml.p3dn.24xlarge",
+            "results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
+        },
+    ]
+)
+class MultiNodeTest(unittest.TestCase):
+    def setUp(self):
+        if self.framework == "pytorch":
+            subprocess.run(
+                f"cp ./examples/pytorch/text-classification/run_glue.py {self.env.test_path}/run_glue.py".split(),
+                encoding="utf-8",
+                check=True,
+            )
+        assert hasattr(self, "env")
+
+    def create_estimator(self, instance_count):
+
+        # configuration for running training on smdistributed Model Parallel
+        mpi_options = {
+            "enabled": True,
+            "processes_per_host": 8,
+        }
+        smp_options = {
+            "enabled": True,
+            "parameters": {
+                "microbatches": 4,
+                "placement_strategy": "spread",
+                "pipeline": "interleaved",
+                "optimize": "speed",
+                "partitions": 4,
+                "ddp": True,
+            },
+        }
+
+        distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}
+
+        name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
+        # creates estimator
+        return HuggingFace(
+            entry_point=self.script,
+            source_dir=self.env.test_path,
+            role=self.env.role,
+            image_uri=self.env.image_uri,
+            base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
+            instance_count=instance_count,
+            instance_type=self.instance_type,
+            debugger_hook_config=False,
+            hyperparameters={
+                **self.env.hyperparameters,
+                "model_name_or_path": self.model_name_or_path,
+                "max_steps": 500,
+            },
+            metric_definitions=self.env.metric_definitions,
+            distribution=distribution,
+            py_version="py36",
+        )
+
+    def save_results_as_csv(self, job_name):
+        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")
+
+    # @parameterized.expand([(2,), (4,),])
+    @parameterized.expand([(1,)])
+    def test_script(self, instance_count):
+        # create estimator
+        estimator = self.create_estimator(instance_count)
+
+        # run training
+        estimator.fit()
+
+        # result dataframe
+        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
+
+        # extract kpis
+        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
+        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
+        # get the train time from the SageMaker job; this includes starting, preprocessing and stopping
+        train_runtime = (
+            Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
+        )
+
+        # assert kpis
+        assert train_runtime <= self.results["train_runtime"]
+        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
+        assert all(t <= self.results["eval_loss"] for t in eval_loss)
+
+        # dump the test results into a json file to share in the PR
+        with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
+            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
diff --git a/sagemaker/test_single_node_gpu.py b/sagemaker/test_single_node_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..e71f82d31634e0bd046c5875dad25d2901b8ec89
--- /dev/null
+++ b/sagemaker/test_single_node_gpu.py
@@ -0,0 +1,96 @@
+import json
+import os
+import subprocess
+import unittest
+from ast import literal_eval
+
+import pytest
+
+from parameterized import parameterized_class
+
+from . import is_sagemaker_available
+
+
+if is_sagemaker_available():
+    from sagemaker import Session, TrainingJobAnalytics
+    from sagemaker.huggingface import HuggingFace
+
+
+@pytest.mark.skipif(
+    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
+    reason="Skipping the test because it should only be run when releasing a minor transformers version",
+)
+@pytest.mark.usefixtures("sm_env")
+@parameterized_class(
+    [
+        {
+            "framework": "pytorch",
+            "script": "run_glue.py",
+            "model_name_or_path": "distilbert-base-cased",
+            "instance_type": "ml.g4dn.xlarge",
+            "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9},
+        },
+        {
+            "framework": "tensorflow",
+            "script": "run_tf.py",
+            "model_name_or_path": "distilbert-base-cased",
+            "instance_type": "ml.g4dn.xlarge",
+            "results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9},
+        },
+    ]
+)
+class SingleNodeTest(unittest.TestCase):
+    def setUp(self):
+        if self.framework == "pytorch":
+            subprocess.run(
+                f"cp ./examples/pytorch/text-classification/run_glue.py {self.env.test_path}/run_glue.py".split(),
+                encoding="utf-8",
+                check=True,
+            )
+        assert hasattr(self, "env")
+
+    def create_estimator(self, instance_count=1):
+        # creates estimator
+        return HuggingFace(
+            entry_point=self.script,
+            source_dir=self.env.test_path,
+            role=self.env.role,
+            image_uri=self.env.image_uri,
+            base_job_name=f"{self.env.base_job_name}-single",
+            instance_count=instance_count,
+            instance_type=self.instance_type,
+            debugger_hook_config=False,
+            hyperparameters={**self.env.hyperparameters, "model_name_or_path": self.model_name_or_path},
+            metric_definitions=self.env.metric_definitions,
+            py_version="py36",
+        )
+
+    def save_results_as_csv(self, job_name):
+        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")
+
+    def test_glue(self):
+        # create estimator
+        estimator = self.create_estimator()
+
+        # run training
+        estimator.fit()
+
+        # result dataframe
+        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
+
+        # extract kpis
+        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
+        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
+        # get the train time from the SageMaker job; this includes starting, preprocessing and stopping
+        train_runtime = (
+            Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
+        )
+
+        # assert kpis
+        assert train_runtime <= self.results["train_runtime"]
+        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
+        assert all(t <= self.results["eval_loss"] for t in eval_loss)
+
+        # dump the test results into a json file to share in the PR
+        with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
+            json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
diff --git a/test_activations.py b/test_activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe15caf819e6ecb748c63c839bb6dc9eedd810a7
--- /dev/null
+++ b/test_activations.py
@@ -0,0 +1,45 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import require_torch
+
+
+if is_torch_available():
+    import torch
+
+    from transformers.activations import _gelu_python, gelu_new, get_activation
+
+
+@require_torch
+class TestActivations(unittest.TestCase):
+    def test_gelu_versions(self):
+        x = torch.tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
+        torch_builtin = get_activation("gelu")
+        self.assertTrue(torch.eq(_gelu_python(x), torch_builtin(x)).all().item())
+        self.assertFalse(torch.eq(_gelu_python(x), gelu_new(x)).all().item())
+
+    def test_get_activation(self):
+        get_activation("swish")
+        get_activation("silu")
+        get_activation("relu")
+        get_activation("tanh")
+        get_activation("gelu_new")
+        get_activation("gelu_fast")
+        with self.assertRaises(KeyError):
+            get_activation("bogus")
+        with self.assertRaises(KeyError):
+            get_activation(None)
diff --git a/test_activations_tf.py b/test_activations_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f9ef2e4cea9a398459979c13d9966c33e094891
--- /dev/null
+++ b/test_activations_tf.py
@@ -0,0 +1,39 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_tf + + +if is_tf_available(): + from transformers.activations_tf import get_tf_activation + + +@require_tf +class TestTFActivations(unittest.TestCase): + def test_get_activation(self): + get_tf_activation("swish") + get_tf_activation("silu") + get_tf_activation("gelu") + get_tf_activation("relu") + get_tf_activation("tanh") + get_tf_activation("gelu_new") + get_tf_activation("gelu_fast") + get_tf_activation("mish") + with self.assertRaises(KeyError): + get_tf_activation("bogus") + with self.assertRaises(KeyError): + get_tf_activation(None) diff --git a/test_benchmark.py b/test_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..359efba8bb5aa2df41f3116bacdf5e101b252c3f --- /dev/null +++ b/test_benchmark.py @@ -0,0 +1,264 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest +from pathlib import Path + +from transformers import AutoConfig, is_torch_available +from transformers.testing_utils import require_torch, torch_device + + +if is_torch_available(): + from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + + +@require_torch +class BenchmarkTest(unittest.TestCase): + def check_results_dict_not_empty(self, results): + for model_result in results.values(): + for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]): + result = model_result["result"][batch_size][sequence_length] + self.assertIsNotNone(result) + + def test_inference_no_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_no_configs_only_pretrain(self): + MODEL_ID = "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + only_pretrain_model=True, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_torchscript(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + torchscript=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + 
self.check_results_dict_not_empty(results.memory_inference_result) + + @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + def test_inference_fp16(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + fp16=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_no_model_no_architectures(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + # set architectures equal to `None` + config.architectures = None + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_train_no_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=False, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_train_no_configs_fp16(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=False, + sequence_lengths=[8], + batch_sizes=[1], + fp16=True, + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + def test_inference_with_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_encoder_decoder_with_configs(self): + MODEL_ID = "sshleifer/tinier_bart" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_train_with_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=False, + sequence_lengths=[8], + batch_sizes=[1], + 
multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + def test_train_encoder_decoder_with_configs(self): + MODEL_ID = "sshleifer/tinier_bart" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + def test_save_csv_files(self): + MODEL_ID = "sshleifer/tiny-gpt2" + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=True, + save_to_csv=True, + sequence_lengths=[8], + batch_sizes=[1], + inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"), + train_memory_csv_file=os.path.join(tmp_dir, "train_mem.csv"), + inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"), + train_time_csv_file=os.path.join(tmp_dir, "train_time.csv"), + env_info_csv_file=os.path.join(tmp_dir, "env.csv"), + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + benchmark.run() + self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "train_time.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "train_mem.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists()) + + def test_trace_memory(self): + MODEL_ID = "sshleifer/tiny-gpt2" + + def _check_summary_is_not_empty(summary): + self.assertTrue(hasattr(summary, "sequential")) + self.assertTrue(hasattr(summary, "cumulative")) + self.assertTrue(hasattr(summary, "current")) + self.assertTrue(hasattr(summary, "total")) + + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + log_filename=os.path.join(tmp_dir, "log.txt"), + log_print=True, + trace_memory_line_by_line=True, + multi_process=False, + ) + benchmark = PyTorchBenchmark(benchmark_args) + result = benchmark.run() + _check_summary_is_not_empty(result.inference_summary) + _check_summary_is_not_empty(result.train_summary) + self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists()) diff --git a/test_benchmark_tf.py b/test_benchmark_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd72e09d0b5a304c474963fd75181bfdaf06377 --- /dev/null +++ b/test_benchmark_tf.py @@ -0,0 +1,226 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest +from pathlib import Path + +from transformers import AutoConfig, is_tf_available +from transformers.testing_utils import require_tf + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + + +@require_tf +class TFBenchmarkTest(unittest.TestCase): + def check_results_dict_not_empty(self, results): + for model_result in results.values(): + for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]): + result = model_result["result"][batch_size][sequence_length] + self.assertIsNotNone(result) + + def test_inference_no_configs_eager(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + eager_mode=True, + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_no_configs_only_pretrain(self): + MODEL_ID = "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + only_pretrain_model=True, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_no_configs_graph(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_with_configs_eager(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + eager_mode=True, + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args, [config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_with_configs_graph(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args, [config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_train_no_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=False, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + results = 
benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + def test_train_with_configs(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=True, + inference=False, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args, [config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + def test_inference_encoder_decoder_with_configs(self): + MODEL_ID = "patrickvonplaten/t5-tiny-random" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + @unittest.skipIf(is_tf_available() and len(tf.config.list_physical_devices("GPU")) == 0, "Cannot do xla on CPU.") + def test_inference_no_configs_xla(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + training=False, + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + use_xla=True, + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_save_csv_files(self): + MODEL_ID = "sshleifer/tiny-gpt2" + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + inference=True, + save_to_csv=True, + sequence_lengths=[8], + batch_sizes=[1], + inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"), + inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"), + env_info_csv_file=os.path.join(tmp_dir, "env.csv"), + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + benchmark.run() + self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists()) + + def test_trace_memory(self): + MODEL_ID = "sshleifer/tiny-gpt2" + + def _check_summary_is_not_empty(summary): + self.assertTrue(hasattr(summary, "sequential")) + self.assertTrue(hasattr(summary, "cumulative")) + self.assertTrue(hasattr(summary, "current")) + self.assertTrue(hasattr(summary, "total")) + + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = TensorFlowBenchmarkArguments( + models=[MODEL_ID], + inference=True, + sequence_lengths=[8], + batch_sizes=[1], + log_filename=os.path.join(tmp_dir, "log.txt"), + log_print=True, + trace_memory_line_by_line=True, + eager_mode=True, + multi_process=False, + ) + benchmark = TensorFlowBenchmark(benchmark_args) + result = benchmark.run() + _check_summary_is_not_empty(result.inference_summary) + self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists()) diff --git a/test_cli.py b/test_cli.py new file mode 100644 index 
0000000000000000000000000000000000000000..78a535140a59358e6f02cfd595461833269e2375 --- /dev/null +++ b/test_cli.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from unittest.mock import patch + +from transformers.testing_utils import CaptureStd + + +class CLITest(unittest.TestCase): + @patch("sys.argv", ["fakeprogrampath", "env"]) + def test_cli_env(self): + # test transformers-cli env + import transformers.commands.transformers_cli + + with CaptureStd() as cs: + transformers.commands.transformers_cli.main() + assert "Python version" in cs.out + assert "Platform" in cs.out + assert "Using distributed or parallel set-up in script?" in cs.out diff --git a/test_configuration_auto.py b/test_configuration_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..ac9a755a7c340856ef00cb41fdd0419b40967d41 --- /dev/null +++ b/test_configuration_auto.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.testing_utils import DUMMY_UNKWOWN_IDENTIFIER + + +SAMPLE_ROBERTA_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy-config.json") + + +class AutoConfigTest(unittest.TestCase): + def test_config_from_model_shortcut(self): + config = AutoConfig.from_pretrained("bert-base-uncased") + self.assertIsInstance(config, BertConfig) + + def test_config_model_type_from_local_file(self): + config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG) + self.assertIsInstance(config, RobertaConfig) + + def test_config_model_type_from_model_identifier(self): + config = AutoConfig.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER) + self.assertIsInstance(config, RobertaConfig) + + def test_config_for_model_str(self): + config = AutoConfig.for_model("roberta") + self.assertIsInstance(config, RobertaConfig) + + def test_pattern_matching_fallback(self): + """ + In cases where config.json doesn't include a model_type, + perform a few safety checks on the config mapping's order. 
+ """ + # no key string should be included in a later key string (typical failure case) + keys = list(CONFIG_MAPPING.keys()) + for i, key in enumerate(keys): + self.assertFalse(any(key in later_key for later_key in keys[i + 1 :])) diff --git a/test_configuration_common.py b/test_configuration_common.py new file mode 100644 index 0000000000000000000000000000000000000000..84c86d1161d54138ad49df63b01025d8acabe79e --- /dev/null +++ b/test_configuration_common.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import tempfile +import unittest + +from huggingface_hub import HfApi +from requests.exceptions import HTTPError +from transformers import BertConfig, GPT2Config +from transformers.testing_utils import ENDPOINT_STAGING, PASS, USER, is_staging_test + + +class ConfigTester(object): + def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs): + self.parent = parent + self.config_class = config_class + self.has_text_modality = has_text_modality + self.inputs_dict = kwargs + + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + if self.has_text_modality: + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) + + def create_and_test_config_to_json_string(self): + config = self.config_class(**self.inputs_dict) + obj = json.loads(config.to_json_string()) + for key, value in self.inputs_dict.items(): + self.parent.assertEqual(obj[key], value) + + def create_and_test_config_to_json_file(self): + config_first = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "config.json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def create_and_test_config_from_and_save_pretrained(self): + config_first = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config_first.save_pretrained(tmpdirname) + config_second = self.config_class.from_pretrained(tmpdirname) + + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def create_and_test_config_with_num_labels(self): + config = self.config_class(**self.inputs_dict, num_labels=5) + self.parent.assertEqual(len(config.id2label), 5) + self.parent.assertEqual(len(config.label2id), 5) + + config.num_labels = 3 + self.parent.assertEqual(len(config.id2label), 3) + self.parent.assertEqual(len(config.label2id), 3) + + def check_config_can_be_init_without_params(self): + if self.config_class.is_composition: + return + config = self.config_class() + self.parent.assertIsNotNone(config) + + def run_common_tests(self): + 
self.create_and_test_config_common_properties() + self.create_and_test_config_to_json_string() + self.create_and_test_config_to_json_file() + self.create_and_test_config_from_and_save_pretrained() + self.create_and_test_config_with_num_labels() + self.check_config_can_be_init_without_params() + + +@is_staging_test +class ConfigPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._api = HfApi(endpoint=ENDPOINT_STAGING) + cls._token = cls._api.login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + cls._api.delete_repo(token=cls._token, name="test-config") + except HTTPError: + pass + + try: + cls._api.delete_repo(token=cls._token, name="test-config-org", organization="valid_org") + except HTTPError: + pass + + def test_push_to_hub(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + with tempfile.TemporaryDirectory() as tmp_dir: + config.save_pretrained(tmp_dir, push_to_hub=True, repo_name="test-config", use_auth_token=self._token) + + new_config = BertConfig.from_pretrained(f"{USER}/test-config") + for k, v in config.__dict__.items(): + if k != "transformers_version": + self.assertEqual(v, getattr(new_config, k)) + + def test_push_to_hub_in_organization(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + config.save_pretrained( + tmp_dir, + push_to_hub=True, + repo_name="test-config-org", + use_auth_token=self._token, + organization="valid_org", + ) + + new_config = BertConfig.from_pretrained("valid_org/test-config-org") + for k, v in config.__dict__.items(): + if k != "transformers_version": + self.assertEqual(v, getattr(new_config, k)) + + +class ConfigTestUtils(unittest.TestCase): + def test_config_from_string(self): + c = GPT2Config() + + # attempt to modify each of int/float/bool/str config records and verify they were updated + n_embd = c.n_embd + 1 # int + resid_pdrop = c.resid_pdrop + 1.0 # float + scale_attn_weights = not c.scale_attn_weights # bool + summary_type = c.summary_type + "foo" # str + c.update_from_string( + f"n_embd={n_embd},resid_pdrop={resid_pdrop},scale_attn_weights={scale_attn_weights},summary_type={summary_type}" + ) + self.assertEqual(n_embd, c.n_embd, "mismatch for key: n_embd") + self.assertEqual(resid_pdrop, c.resid_pdrop, "mismatch for key: resid_pdrop") + self.assertEqual(scale_attn_weights, c.scale_attn_weights, "mismatch for key: scale_attn_weights") + self.assertEqual(summary_type, c.summary_type, "mismatch for key: summary_type") diff --git a/test_data_collator.py b/test_data_collator.py new file mode 100644 index 0000000000000000000000000000000000000000..e9d363229f6e03490a19e123941c6f4ac72872c4 --- /dev/null +++ b/test_data_collator.py @@ -0,0 +1,292 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
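+# Usage sketch (illustrative; mirrors the tests below): default_data_collator batches a list of feature dicts into tensors: +# features = [{"label": 0, "inputs": [0, 1, 2]}, {"label": 1, "inputs": [3, 4, 5]}] +# batch = default_data_collator(features) # batch["labels"] == tensor([0, 1]); batch["inputs"].shape == (2, 3)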
+ +import os +import shutil +import tempfile +import unittest + +from transformers import BertTokenizer, is_torch_available, set_seed +from transformers.testing_utils import require_torch + + +if is_torch_available(): + import torch + + from transformers import ( + DataCollatorForLanguageModeling, + DataCollatorForPermutationLanguageModeling, + DataCollatorForTokenClassification, + DataCollatorWithPadding, + default_data_collator, + ) + + +@require_torch +class DataCollatorIntegrationTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt") + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_default_with_dict(self): + features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features) + self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8))))) + self.assertEqual(batch["labels"].dtype, torch.long) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 6])) + + # With label_ids + features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features) + self.assertTrue(batch["labels"].equal(torch.tensor([[0, 1, 2]] * 8))) + self.assertEqual(batch["labels"].dtype, torch.long) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 6])) + + # Features can already be tensors + features = [{"label": i, "inputs": torch.randint(10, [10])} for i in range(8)] + batch = default_data_collator(features) + self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8))))) + self.assertEqual(batch["labels"].dtype, torch.long) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 10])) + + # Labels can already be tensors + features = [{"label": torch.tensor(i), "inputs": torch.randint(10, [10])} for i in range(8)] + batch = default_data_collator(features) + self.assertEqual(batch["labels"].dtype, torch.long) + self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8))))) + self.assertEqual(batch["labels"].dtype, torch.long) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 10])) + + def test_default_classification_and_regression(self): + data_collator = default_data_collator + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)] + batch = data_collator(features) + self.assertEqual(batch["labels"].dtype, torch.long) + + features = [{"input_ids": [0, 1, 2, 3, 4], "label": float(i)} for i in range(4)] + batch = data_collator(features) + self.assertEqual(batch["labels"].dtype, torch.float) + + def test_default_with_no_labels(self): + features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features) + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 6])) + + # With label_ids + features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)] + batch = default_data_collator(features) + self.assertTrue("labels" not in batch) + self.assertEqual(batch["inputs"].shape, torch.Size([8, 6])) + + def test_data_collator_with_padding(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}] + + data_collator = DataCollatorWithPadding(tokenizer) + batch = data_collator(features) + 
self.assertEqual(batch["input_ids"].shape, torch.Size([2, 6])) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + + data_collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 10])) + + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 8])) + + def test_data_collator_for_token_classification(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2], "labels": [0, 1, 2]}, + {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]}, + ] + + data_collator = DataCollatorForTokenClassification(tokenizer) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 6])) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape, torch.Size([2, 6])) + self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-100] * 3) + + data_collator = DataCollatorForTokenClassification(tokenizer, padding="max_length", max_length=10) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 10])) + self.assertEqual(batch["labels"].shape, torch.Size([2, 10])) + + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 8])) + self.assertEqual(batch["labels"].shape, torch.Size([2, 8])) + + data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1) + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, torch.Size([2, 6])) + self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3) + self.assertEqual(batch["labels"].shape, torch.Size([2, 6])) + self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3) + + def _test_no_pad_and_pad(self, no_pad_features, pad_features): + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + tokenizer._pad_token = None + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + with self.assertRaises(ValueError): + # Expect error due to padding token missing + data_collator(pad_features) + + set_seed(42) # For reproducibility + tokenizer = BertTokenizer(self.vocab_file) + data_collator = DataCollatorForLanguageModeling(tokenizer) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["labels"].shape, 
torch.Size((2, 10))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(no_pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + batch = data_collator(pad_features) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 16))) + + masked_tokens = batch["input_ids"] == tokenizer.mask_token_id + self.assertTrue(torch.any(masked_tokens)) + self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) + + def test_data_collator_for_language_modeling(self): + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + no_pad_features = [list(range(10)), list(range(10))] + pad_features = [list(range(5)), list(range(10))] + self._test_no_pad_and_pad(no_pad_features, pad_features) + + def test_plm(self): + tokenizer = BertTokenizer(self.vocab_file) + no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}] + pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}] + + data_collator = DataCollatorForPermutationLanguageModeling(tokenizer) + + batch = data_collator(pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10))) + self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + + batch = data_collator(no_pad_features) + self.assertIsInstance(batch, dict) + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10))) + self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10))) + self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 10))) + + example = [torch.randint(5, [5])] + with self.assertRaises(ValueError): + # Expect error due to odd sequence length + data_collator(example) + + def test_nsp(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + {"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i} + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 5))) + self.assertEqual(batch["next_sentence_label"].shape, 
torch.Size((2,))) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) + self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,))) + + def test_sop(self): + tokenizer = BertTokenizer(self.vocab_file) + features = [ + { + "input_ids": torch.tensor([0, 1, 2, 3, 4]), + "token_type_ids": torch.tensor([0, 1, 2, 3, 4]), + "sentence_order_label": i, + } + for i in range(2) + ] + data_collator = DataCollatorForLanguageModeling(tokenizer) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 5))) + self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,))) + + data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) + batch = data_collator(features) + + self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) + self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) + self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,))) diff --git a/test_doc_samples.py b/test_doc_samples.py new file mode 100644 index 0000000000000000000000000000000000000000..8e945bae9db972618340bacbd281f2173f821237 --- /dev/null +++ b/test_doc_samples.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import doctest +import logging +import os +import unittest +from pathlib import Path +from typing import List, Union + +import transformers +from transformers.testing_utils import require_tf, require_torch, slow + + +logger = logging.getLogger() + + +@unittest.skip("Temporarily disable the doc tests.") +@require_torch +@require_tf +@slow +class TestCodeExamples(unittest.TestCase): + def analyze_directory( + self, + directory: Path, + identifier: Union[str, None] = None, + ignore_files: Union[List[str], None] = None, + n_identifier: Union[str, List[str], None] = None, + only_modules: bool = True, + ): + """ + Runs through the specific directory, looking for the files identified with `identifier`. Executes + the doctests in those files + + Args: + directory (:obj:`Path`): Directory containing the files + identifier (:obj:`str`): Will parse files containing this + ignore_files (:obj:`List[str]`): List of files to skip + n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers. 
+ only_modules (:obj:`bool`): Whether to only analyze modules + """ + files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))] + + if identifier is not None: + files = [file for file in files if identifier in file] + + if n_identifier is not None: + if isinstance(n_identifier, List): + for n_ in n_identifier: + files = [file for file in files if n_ not in file] + else: + files = [file for file in files if n_identifier not in file] + + ignore_files = ignore_files or [] + ignore_files.append("__init__.py") + files = [file for file in files if file not in ignore_files] + + for file in files: + # Open all files + print("Testing", file) + + if only_modules: + module_identifier = file.split(".")[0] + try: + module_identifier = getattr(transformers, module_identifier) + suite = doctest.DocTestSuite(module_identifier) + result = unittest.TextTestRunner().run(suite) + self.assertIs(len(result.failures), 0) + except AttributeError: + logger.info(f"{module_identifier} is not a module.") + else: + result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS) + self.assertIs(result.failed, 0) + + def test_modeling_examples(self): + transformers_directory = Path("src/transformers") + files = "modeling" + ignore_files = [ + "modeling_ctrl.py", + "modeling_tf_ctrl.py", + ] + self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files) + + def test_tokenization_examples(self): + transformers_directory = Path("src/transformers") + files = "tokenization" + self.analyze_directory(transformers_directory, identifier=files) + + def test_configuration_examples(self): + transformers_directory = Path("src/transformers") + files = "configuration" + self.analyze_directory(transformers_directory, identifier=files) + + def test_remaining_examples(self): + transformers_directory = Path("src/transformers") + n_identifiers = ["configuration", "modeling", "tokenization"] + self.analyze_directory(transformers_directory, n_identifier=n_identifiers) + + def test_doc_sources(self): + doc_source_directory = Path("docs/source") + ignore_files = ["favicon.ico"] + self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False) diff --git a/test_feature_extraction_auto.py b/test_feature_extraction_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..7502e8422431f5dbd3ded5ea80fb439cf229a270 --- /dev/null +++ b/test_feature_extraction_auto.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2021 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
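+# Usage sketch (illustrative; mirrors the tests below): AutoFeatureExtractor.from_pretrained resolves the concrete extractor class from a hub model id, a local directory with a saved feature extractor config, or a config file path: +# extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") # -> Wav2Vec2FeatureExtractor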
+ +import os +import unittest + +from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor + + +SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") +SAMPLE_FEATURE_EXTRACTION_CONFIG = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy_feature_extractor_config.json" +) + + +class AutoFeatureExtractorTest(unittest.TestCase): + def test_feature_extractor_from_model_shortcut(self): + config = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) + + def test_feature_extractor_from_local_directory(self): + config = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR) + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) + + def test_feature_extractor_from_local_file(self): + config = AutoFeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG) + self.assertIsInstance(config, Wav2Vec2FeatureExtractor) diff --git a/test_feature_extraction_clip.py b/test_feature_extraction_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..eac10af6f43a9cd5288ddf73e739a14657225be7 --- /dev/null +++ b/test_feature_extraction_clip.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
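+# Note (illustrative): the image_mean/image_std defaults below are the normalization constants published with OpenAI's CLIP; after scaling pixels to [0, 1], each channel is normalized as (x - mean) / std, e.g. (1.0 - 0.48145466) / 0.26862954 ≈ 1.93 for a saturated red channel.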
+ + +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPFeatureExtractor + + +class CLIPFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=20, + do_center_crop=True, + crop_size=18, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + if equal_resolution: + image_inputs = [] + for i in range(self.batch_size): + image_inputs.append( + np.random.randint( + 255, size=(self.num_channels, self.max_resolution, self.max_resolution), dtype=np.uint8 + ) + ) + else: + image_inputs = [] + for i in range(self.batch_size): + width, height = np.random.choice(np.arange(self.min_resolution, self.max_resolution), 2) + image_inputs.append(np.random.randint(255, size=(self.num_channels, width, height), dtype=np.uint8)) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + if torchify: + image_inputs = [torch.from_numpy(x) for x in image_inputs] + + return image_inputs + + +@require_torch +@require_vision +class CLIPFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = CLIPFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = CLIPFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "do_center_crop")) + self.assertTrue(hasattr(feature_extractor, "center_crop")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, 
"image_std")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) diff --git a/test_feature_extraction_common.py b/test_feature_extraction_common.py new file mode 100644 index 0000000000000000000000000000000000000000..217da135ca1cd3eebafb16299cb9d64de7f9e0fc --- /dev/null +++ b/test_feature_extraction_common.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import tempfile + +from transformers.file_utils import is_torch_available, is_vision_available + + +if is_torch_available(): + import numpy as np + import torch + +if is_vision_available(): + from PIL import Image + + +def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + if equal_resolution: + image_inputs = [] + for i in range(feature_extract_tester.batch_size): + image_inputs.append( + np.random.randint( + 255, + size=( + feature_extract_tester.num_channels, + feature_extract_tester.max_resolution, + feature_extract_tester.max_resolution, + ), + dtype=np.uint8, + ) + ) + else: + image_inputs = [] + for i in range(feature_extract_tester.batch_size): + width, height = np.random.choice( + np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2 + ) + image_inputs.append( + np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8) + ) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + if torchify: + image_inputs = [torch.from_numpy(x) for x in image_inputs] + + return image_inputs + + +class FeatureExtractionSavingTestMixin: + def test_feat_extract_to_json_string(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + obj = json.loads(feat_extract.to_json_string()) + for key, value in self.feat_extract_dict.items(): + self.assertEqual(obj[key], value) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + feat_extract_first.save_pretrained(tmpdirname) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) + + def test_init_without_params(self): + feat_extract = self.feature_extraction_class() + self.assertIsNotNone(feat_extract) diff --git a/test_feature_extraction_deit.py b/test_feature_extraction_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..dc86074dc98eb177c0e2db5f8dd2232eba45b198 --- /dev/null +++ b/test_feature_extraction_deit.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# 
Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DeiTFeatureExtractor + + +class DeiTFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=20, + do_center_crop=True, + crop_size=18, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + +@require_torch +@require_vision +class DeiTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DeiTFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DeiTFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "do_center_crop")) + self.assertTrue(hasattr(feature_extractor, "center_crop")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + 
self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.crop_size, + self.feature_extract_tester.crop_size, + ), + ) diff --git a/test_feature_extraction_detr.py b/test_feature_extraction_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..8f36ad418f52a5becfb894e04efca2ae246a9dfe --- /dev/null +++ b/test_feature_extraction_detr.py @@ -0,0 +1,339 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
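+# Sizing note (illustrative of get_expected_values below): DetrFeatureExtractor resizes so the shorter side equals `size` while preserving the aspect ratio; e.g. with size=18, a 40x80 (WxH) image becomes width=18, height=int(18 * 80 / 40) = 36.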
+ + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DetrFeatureExtractor + + +class DetrFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + max_size=1333, # by setting max_size > max_resolution we're effectively not testing this :p + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.max_size = max_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "max_size": self.max_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to DetrFeatureExtractor, + assuming do_resize is set to True with a scalar size. + """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size * h / w) + expected_width = self.size + elif w > h: + expected_height = self.size + expected_width = int(self.size * w / h) + else: + expected_height = self.size + expected_width = self.size + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + +@require_torch +@require_vision +class DetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = DetrFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = DetrFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + self.assertTrue(hasattr(feature_extractor, "max_size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = 
prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs) + + self.assertEqual( + encoded_images.shape, + (1, self.feature_extract_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.feature_extract_tester.get_expected_values(image_inputs, batched=True) + + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def test_equivalence_pad_and_create_pixel_mask(self): + # Initialize feature_extractors + feature_extractor_1 = self.feature_extraction_class(**self.feat_extract_dict) + feature_extractor_2 = self.feature_extraction_class(do_resize=False, do_normalize=False) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + 
self.assertIsInstance(image, torch.Tensor) + + # Test whether the method "pad_and_create_pixel_mask" and calling the feature extractor return the same tensors + encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") + encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") + + assert torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) + assert torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + # TODO replace by facebook/detr-resnet-50 + feature_extractor = DetrFeatureExtractor.from_pretrained("nielsr/detr-resnet-50") + encoding = feature_extractor(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + assert torch.allclose(encoding["target"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["target"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic + feature_extractor = DetrFeatureExtractor(format="coco_panoptic") + encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) +
self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + assert torch.allclose(encoding["target"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["target"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + assert torch.allclose(encoding["target"][0]["boxes"][0], expected_boxes_slice, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + assert torch.allclose(encoding["target"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + assert torch.allclose(encoding["target"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + assert torch.allclose(encoding["target"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822338 + self.assertEqual(encoding["target"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + assert torch.allclose(encoding["target"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + assert torch.allclose(encoding["target"][0]["size"], expected_size) diff --git a/test_feature_extraction_speech_to_text.py b/test_feature_extraction_speech_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..c90beef01377dc8a4a631355565907e2d9d42a71 --- /dev/null +++ b/test_feature_extraction_speech_to_text.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
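One detail from the DETR equivalence test above is worth spelling out: batched images of different resolutions are zero-padded to the largest height and width, and a pixel mask marks real pixels with 1 and padding with 0. A rough numpy sketch of those semantics; the helper is hypothetical, not the library implementation:

```python
import numpy as np

def pad_and_mask(images):
    """images: list of (C, H, W) arrays -> (padded batch, pixel masks)."""
    max_h = max(img.shape[1] for img in images)
    max_w = max(img.shape[2] for img in images)
    batch, masks = [], []
    for img in images:
        c, h, w = img.shape
        padded = np.zeros((c, max_h, max_w), dtype=img.dtype)
        padded[:, :h, :w] = img  # copy the real pixels into the top-left corner
        mask = np.zeros((max_h, max_w), dtype=np.int64)
        mask[:h, :w] = 1  # 1 = real pixel, 0 = padding
        batch.append(padded)
        masks.append(mask)
    return np.stack(batch), np.stack(masks)

pixel_values, pixel_mask = pad_and_mask([np.ones((3, 2, 4)), np.ones((3, 3, 2))])
assert pixel_values.shape == (2, 3, 3, 4)
assert pixel_mask[0, 2, 0] == 0  # third row of the first image is padding
```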
+ + +import itertools +import random +import unittest + +import numpy as np + +from transformers import is_speech_available +from transformers.testing_utils import require_torch, require_torchaudio + +from .test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_speech_available(): + from transformers import Speech2TextFeatureExtractor + +global_rng = random.Random() + + +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +@require_torch +@require_torchaudio +class Speech2TextFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=24, + num_mel_bins=24, + padding_value=0.0, + sampling_rate=16_000, + return_attention_mask=True, + do_normalize=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.num_mel_bins = num_mel_bins + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "num_mel_bins": self.num_mel_bins, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + "do_normalize": self.do_normalize, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] + else: + speech_inputs = [ + floats_list((x, self.feature_size)) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + return speech_inputs + + +@require_torch +@require_torchaudio +class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + + feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None + + def setUp(self): + self.feat_extract_tester = Speech2TextFeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test feature size + input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features + self.assertTrue(input_features.ndim == 3) + self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size) + + # Test not batched input + encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features + 
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_cepstral_mean_and_variance_normalization(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + inputs = feature_extractor(speech_inputs, padding=True, return_tensors="np", return_attention_mask=True) + input_features = inputs.input_features + attention_mask = inputs.attention_mask + fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) + + def _check_zero_mean_unit_variance(input_vector): + self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3)) + self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3)) + + _check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) + _check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) + _check_zero_mean_unit_variance(input_features[2, : fbank_feat_lengths[2]]) diff --git a/test_feature_extraction_vit.py b/test_feature_extraction_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..283a94d8ac1bb9d3cccb8c9a9d5e1ecc021059db --- /dev/null +++ b/test_feature_extraction_vit.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
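The cepstral mean and variance normalization test above asserts one property: over the unpadded frames of each utterance, every feature dimension ends up with near-zero mean and near-unit variance. A minimal sketch of such a per-utterance normalization, under the assumption that statistics are computed over valid frames only:

```python
import numpy as np

def cmvn(features: np.ndarray) -> np.ndarray:
    """features: (num_frames, feature_size) -> mean/variance-normalized copy."""
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    return (features - mean) / (std + 1e-10)  # epsilon guards constant dims

feats = np.random.rand(100, 24).astype(np.float32)
normed = cmvn(feats)
assert np.all(np.abs(normed.mean(axis=0)) < 1e-3)
assert np.all(np.abs(normed.var(axis=0) - 1) < 1e-2)
```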
+ + +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class ViTFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + +@require_torch +@require_vision +class ViTFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = ViTFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = ViTFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL images + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], 
return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) diff --git a/test_feature_extraction_wav2vec2.py b/test_feature_extraction_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..d55d951ee3ec8dc467534d58d257df13f8b50b2d --- /dev/null +++ b/test_feature_extraction_wav2vec2.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
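Unlike the DETR expectations, every ViT shape assertion above is a fixed `(batch, channels, size, size)`: ViT-style preprocessing resizes each image to a square of side `size`, so the output shape is independent of the input resolution. A rough sketch of that behavior; the helper is illustrative, not the library code:

```python
import numpy as np
from PIL import Image

def to_square_batch(images, size=18):
    """images: list of PIL images -> float32 array of shape (N, 3, size, size)."""
    arrays = []
    for image in images:
        resized = image.resize((size, size))  # fixed square; aspect ratio is not kept
        array = np.asarray(resized, dtype=np.float32) / 255.0  # (H, W, C) in [0, 1]
        arrays.append(array.transpose(2, 0, 1))  # -> (C, H, W)
    return np.stack(arrays)

images = [Image.new("RGB", (30, 400)), Image.new("RGB", (123, 57))]
assert to_square_batch(images).shape == (2, 3, 18, 18)
```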
+ + +import itertools +import random +import unittest + +import numpy as np + +from transformers import WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2Config, Wav2Vec2FeatureExtractor +from transformers.testing_utils import require_torch, slow + +from .test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +global_rng = random.Random() + + +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +class Wav2Vec2FeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=1, + padding_value=0.0, + sampling_rate=16000, + return_attention_mask=True, + do_normalize=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + "do_normalize": self.do_normalize, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + speech_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + speech_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + + return speech_inputs + + +class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + + feature_extraction_class = Wav2Vec2FeatureExtractor + + def setUp(self): + self.feat_extract_tester = Wav2Vec2FeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test not batched input + encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_zero_mean_unit_variance_normalization(self): + feat_extract = 
self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + processed = feat_extract(speech_inputs, padding="longest") + input_values = processed.input_values + + def _check_zero_mean_unit_variance(input_vector): + self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3) + self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3) + + _check_zero_mean_unit_variance(input_values[0, :800]) + _check_zero_mean_unit_variance(input_values[1, :1000]) + _check_zero_mean_unit_variance(input_values[2]) + + @slow + @require_torch + def test_pretrained_checkpoints_are_set_correctly(self): + # this test makes sure that models that are using + # group norm don't have their feature extractor return the + # attention_mask + for model_id in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST: + config = Wav2Vec2Config.from_pretrained(model_id) + feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(model_id) + + # only "layer" feature extraction norm should make use of + # attention_mask + self.assertEqual(feat_extract.return_attention_mask, config.feat_extract_norm == "layer") diff --git a/test_file_utils.py b/test_file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..63f665647b30642d6049db52882669fd74ca4a02 --- /dev/null +++ b/test_file_utils.py @@ -0,0 +1,80 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import requests + +# Try to import everything from transformers to ensure every object can be loaded. +from transformers import * # noqa F406 +from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME, filename_to_url, get_from_cache, hf_bucket_url +from transformers.testing_utils import DUMMY_UNKWOWN_IDENTIFIER + + +MODEL_ID = DUMMY_UNKWOWN_IDENTIFIER +# An actual model hosted on huggingface.co + +REVISION_ID_DEFAULT = "main" +# Default branch name +REVISION_ID_ONE_SPECIFIC_COMMIT = "f2c752cfc5c0ab6f4bdec59acea69eefbee381c2" +# One particular commit (not the top of `main`) +REVISION_ID_INVALID = "aaaaaaa" +# This commit does not exist, so we should 404. + +PINNED_SHA1 = "d9e9f15bc825e4b2c9249e9578f884bbcb5e3684" +# Sha-1 of config.json on the top of `main`, for checking purposes +PINNED_SHA256 = "4b243c475af8d0a7754e87d7d096c92e5199ec2fe168a2ee7998e3b8e9bcb1d3" +# Sha-256 of pytorch_model.bin on the top of `main`, for checking purposes + + +class GetFromCacheTests(unittest.TestCase): + def test_bogus_url(self): + # This lets us simulate no connection + # as the error raised is the same + # `ConnectionError` + url = "https://bogus" + with self.assertRaisesRegex(ValueError, "Connection error"): + _ = get_from_cache(url) + + def test_file_not_found(self): + # Valid revision (None) but missing file. 
+ url = hf_bucket_url(MODEL_ID, filename="missing.bin") + with self.assertRaisesRegex(requests.exceptions.HTTPError, "404 Client Error"): + _ = get_from_cache(url) + + def test_revision_not_found(self): + # Valid file but missing revision + url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_INVALID) + with self.assertRaisesRegex(requests.exceptions.HTTPError, "404 Client Error"): + _ = get_from_cache(url) + + def test_standard_object(self): + url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_DEFAULT) + filepath = get_from_cache(url, force_download=True) + metadata = filename_to_url(filepath) + self.assertEqual(metadata, (url, f'"{PINNED_SHA1}"')) + + def test_standard_object_rev(self): + # Same object, but different revision + url = hf_bucket_url(MODEL_ID, filename=CONFIG_NAME, revision=REVISION_ID_ONE_SPECIFIC_COMMIT) + filepath = get_from_cache(url, force_download=True) + metadata = filename_to_url(filepath) + self.assertNotEqual(metadata[1], f'"{PINNED_SHA1}"') + # Caution: check that the etag is *not* equal to the one from `test_standard_object` + + def test_lfs_object(self): + url = hf_bucket_url(MODEL_ID, filename=WEIGHTS_NAME, revision=REVISION_ID_DEFAULT) + filepath = get_from_cache(url, force_download=True) + metadata = filename_to_url(filepath) + self.assertEqual(metadata, (url, f'"{PINNED_SHA256}"')) diff --git a/test_flax_auto.py b/test_flax_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..41c5d0d796ed289829be5596e31bca9ae016a0e4 --- /dev/null +++ b/test_flax_auto.py @@ -0,0 +1,78 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
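For context on the cache tests above: Hub files are addressed by a (model_id, filename, revision) triple, and `get_from_cache` keys freshness off the ETag the server returns, which is why the pinned-SHA assertions work. A sketch assuming the standard `https://huggingface.co/{model_id}/resolve/{revision}/{filename}` layout; the helper names are illustrative:

```python
import requests

def resolve_url(model_id: str, filename: str, revision: str = "main") -> str:
    # assumed resolve-endpoint layout; treat as an illustration, not a spec
    return f"https://huggingface.co/{model_id}/resolve/{revision}/{filename}"

def fetch_etag(url: str) -> str:
    # a HEAD request reads metadata without downloading the payload
    response = requests.head(url, allow_redirects=False)
    response.raise_for_status()  # surfaces the 404s the tests expect
    return response.headers.get("ETag", "")

# Different revisions of the same file generally carry different ETags, e.g.:
# fetch_etag(resolve_url("bert-base-cased", "config.json"))
# fetch_etag(resolve_url("bert-base-cased", "config.json", revision="<commit sha>"))
```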
+ +import unittest + +from transformers import AutoConfig, AutoTokenizer, BertConfig, TensorType, is_flax_available +from transformers.testing_utils import require_flax, slow + + +if is_flax_available(): + import jax + from transformers.models.auto.modeling_flax_auto import FlaxAutoModel + from transformers.models.bert.modeling_flax_bert import FlaxBertModel + from transformers.models.roberta.modeling_flax_roberta import FlaxRobertaModel + + +@require_flax +class FlaxAutoModelTest(unittest.TestCase): + @slow + def test_bert_from_pretrained(self): + for model_name in ["bert-base-cased", "bert-large-uncased"]: + with self.subTest(model_name): + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = FlaxAutoModel.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, FlaxBertModel) + + @slow + def test_roberta_from_pretrained(self): + for model_name in ["roberta-base", "roberta-large"]: + with self.subTest(model_name): + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = FlaxAutoModel.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, FlaxRobertaModel) + + @slow + def test_bert_jax_jit(self): + for model_name in ["bert-base-cased", "bert-large-uncased"]: + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = FlaxBertModel.from_pretrained(model_name) + tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX) + + @jax.jit + def eval(**kwargs): + return model(**kwargs) + + eval(**tokens).block_until_ready() + + @slow + def test_roberta_jax_jit(self): + for model_name in ["roberta-base", "roberta-large"]: + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = FlaxRobertaModel.from_pretrained(model_name) + tokens = tokenizer("Do you support jax jitted function?", return_tensors=TensorType.JAX) + + @jax.jit + def eval(**kwargs): + return model(**kwargs) + + eval(**tokens).block_until_ready() diff --git a/test_generation_beam_search.py b/test_generation_beam_search.py new file mode 100644 index 0000000000000000000000000000000000000000..fdbe35eafaa4494fe98f4768c2c68bf03c1ef703 --- /dev/null +++ b/test_generation_beam_search.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
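The `jax.jit` pattern in the Flax auto-model tests above deserves a note: the forward pass is wrapped in `jit`, and `block_until_ready()` forces JAX's asynchronous dispatch to finish so the test really exercises the compiled path. A self-contained sketch with a toy function standing in for the model:

```python
import jax
import jax.numpy as jnp

def forward(x):
    # stand-in for a model forward pass
    return jnp.tanh(x @ x.T)

jit_forward = jax.jit(forward)  # compiled on first call

x = jnp.ones((4, 8))
eager_out = forward(x)
jit_out = jit_forward(x).block_until_ready()  # wait for the async result

# compilation must not change the numbers (up to float tolerance)
assert jnp.allclose(eager_out, jit_out, atol=1e-6)
```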
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, torch_device + +from .test_modeling_common import floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers.generation_beam_search import BeamHypotheses, BeamSearchScorer + + +class BeamSearchTester: + def __init__( + self, + parent, + batch_size=3, + sequence_length=10, + vocab_size=99, + pad_token_id=0, + max_length=20, + num_beams=4, + length_penalty=2.0, + do_early_stopping=True, + num_beam_hyps_to_keep=2, + ): + self.parent = parent + self.batch_size = batch_size + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.pad_token_id = pad_token_id + self.max_length = max_length + self.num_beams = num_beams + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + + # cannot be randomly generated + self.eos_token_id = vocab_size + 1 + + def prepare_beam_scorer(self, **kwargs): + return BeamSearchScorer( + batch_size=kwargs.get("batch_size", self.batch_size), + num_beams=kwargs.get("num_beams", self.num_beams), + device=torch_device, + length_penalty=kwargs.get("length_penalty", self.length_penalty), + do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping), + num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep), + ) + + def prepare_inputs(self): + input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size) + next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device) + next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device) + next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True) + return (input_ids, next_tokens, next_indices, next_scores) + + def check_beam_hypotheses(self, input_ids, *args): + # check that correct number of beam hypotheses is set in beam scorer + beam_scorer = self.prepare_beam_scorer(do_early_stopping=True) + beam_hyp = beam_scorer._beam_hyps[0] + + self.parent.assertEqual(len(beam_scorer._beam_hyps), self.batch_size) + + # check correct type + self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses)) + + # check that num_beams is correctly set + self.parent.assertEqual(beam_hyp.num_beams, self.num_beams) + + # check behavior with early stopping activated + for beam_idx in range(self.num_beams): + beam_hyp.add(input_ids[beam_idx], -10.0) + + # if early stopping True -> score does not matter + self.parent.assertTrue(beam_hyp.is_done(-10.0, 5)) + + # re-init with early stopping deactivated + beam_scorer = self.prepare_beam_scorer(do_early_stopping=False) + beam_hyp = beam_scorer._beam_hyps[0] + + # add `num_beams + 1` beams to change `worst_score` + for beam_idx in range(self.num_beams + 1): + beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx)) + + # -10.0 is removed => -9.0 is worst score + self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length ** beam_hyp.length_penalty)) + + # -5.0 is better than worst score => should not be finished + self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length)) + + # -20.0 is worse than worst score => should be finished + self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length)) + + def check_beam_scorer_update(self, input_ids, next_tokens, next_indices, next_scores): + # check too many eos tokens + beam_scorer =
self.prepare_beam_scorer() + + tokens = next_tokens.clone() + tokens[0, :] = self.eos_token_id + + with self.parent.assertRaises(ValueError): + beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id) + + # check all batches are done + beam_scorer = self.prepare_beam_scorer() + + tokens = next_tokens.clone() + tokens[:, : self.num_beams] = self.eos_token_id + beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id) + # beam scorer should be done + self.parent.assertTrue(beam_scorer.is_done) + + # check that a single eos token per batch is handled correctly + beam_scorer = self.prepare_beam_scorer() + + tokens = next_tokens.clone() + tokens[:, 1] = self.eos_token_id + beam_outputs = beam_scorer.process( + input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id + ) + output_scores = beam_outputs["next_beam_scores"] + output_tokens = beam_outputs["next_beam_tokens"] + output_indices = beam_outputs["next_beam_indices"] + + def cut_expected_tensor(tensor): + return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten() + + # check all outputs + # cut out id of eos token and take best `num_beams` outputs + expected_output_tokens = cut_expected_tensor(tokens) + expected_output_scores = cut_expected_tensor(next_scores) + + # add num_beams * batch_idx + expected_output_indices = ( + cut_expected_tensor(next_indices) + + (torch.arange(self.num_beams * self.batch_size, device=torch_device) // self.num_beams) * self.num_beams + ) + + self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) + self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) + self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3)) + + # make sure ids of eos token are correctly saved in beam_hyps of beam scorer + for batch_idx in range(self.batch_size): + correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1] + self.parent.assertListEqual( + input_ids[correct_idx].tolist(), beam_scorer._beam_hyps[batch_idx].beams[0][-1].tolist() + ) + + def check_beam_scores_finalize(self, input_ids, next_tokens, next_indices, next_scores): + # max_length should be only one more than current input_ids to check that eos is correctly appended + max_length = self.sequence_length + 1 + beam_scorer = self.prepare_beam_scorer(num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False) + + # update beams and append to input_ids + tokens = next_tokens.clone() + # first batch, first output has to finish with eos token id since scores are correctly sorted + tokens[0, 0] = self.eos_token_id + # make sure corresponding score is as good as possible to surely be picked first + next_scores[0, 0] = 0.0 + beam_outputs = beam_scorer.process( + input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id + ) + output_scores = beam_outputs["next_beam_scores"] + output_tokens = beam_outputs["next_beam_tokens"] + output_indices = beam_outputs["next_beam_indices"] + + input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1) + + # finalize + sequence_output = beam_scorer.finalize( + input_ids, + output_scores, + output_tokens, + output_indices, + pad_token_id=self.pad_token_id, + eos_token_id=self.eos_token_id, + max_length=max_length, + ) + + sequences = sequence_output["sequences"] + sequence_scores = sequence_output["sequence_scores"] + + # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length` +
self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length]) + self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size]) + + # check sequence_scores + self.parent.assertFalse((sequence_scores > 0).any().item()) + + # first batch has to finish with eos_token + self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id) + + # other batches cannot finish with eos token + self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id) + self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id) + + # now test that if `num_beam_hyps_to_keep` == `num_beams` => all beams are returned + beam_scorer.num_beam_hyps_to_keep = self.num_beams + sequence_output = beam_scorer.finalize( + input_ids, + output_scores, + output_tokens, + output_indices, + pad_token_id=self.pad_token_id, + eos_token_id=self.eos_token_id, + max_length=max_length, + ) + sequences = sequence_output["sequences"] + sequence_scores = sequence_output["sequence_scores"] + + self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length]) + self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size]) + + +@require_torch +class BeamSearchTest(unittest.TestCase): + def setUp(self): + self.beam_search_tester = BeamSearchTester(self) + + def test_beam_hypotheses(self): + inputs = self.beam_search_tester.prepare_inputs() + self.beam_search_tester.check_beam_hypotheses(*inputs) + + def test_beam_scorer_update(self): + inputs = self.beam_search_tester.prepare_inputs() + self.beam_search_tester.check_beam_scorer_update(*inputs) + + def test_beam_scorer_finalize(self): + inputs = self.beam_search_tester.prepare_inputs() + self.beam_search_tester.check_beam_scores_finalize(*inputs) diff --git a/test_generation_flax_logits_process.py b/test_generation_flax_logits_process.py new file mode 100644 index 0000000000000000000000000000000000000000..4dacb5dc0ad9b5fe2769004a179fbed5b9f4a331 --- /dev/null +++ b/test_generation_flax_logits_process.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
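A sketch of the bookkeeping the `BeamHypotheses` assertions above pin down: each hypothesis is stored with score `sum_logprobs / len(tokens) ** length_penalty`, only the best `num_beams` are kept, and without early stopping a batch is done once no candidate can still beat the worst kept score. The class below is a simplified stand-in, not the library implementation:

```python
class TinyBeamHyps:
    def __init__(self, num_beams: int, length_penalty: float = 2.0):
        self.num_beams = num_beams
        self.length_penalty = length_penalty
        self.beams = []  # list of (score, tokens), best first

    def add(self, tokens, sum_logprobs: float):
        score = sum_logprobs / len(tokens) ** self.length_penalty
        self.beams.append((score, tokens))
        self.beams.sort(key=lambda beam: beam[0], reverse=True)
        del self.beams[self.num_beams:]  # drop the worst surplus hypothesis

    def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
        if len(self.beams) < self.num_beams:
            return False
        # done when even the best open candidate cannot beat the worst kept score
        return best_sum_logprobs / cur_len ** self.length_penalty <= self.beams[-1][0]

hyps = TinyBeamHyps(num_beams=4)
for i in range(5):  # num_beams + 1 additions with sum_logprobs -10 .. -6
    hyps.add(list(range(10)), -10.0 + i)
assert abs(hyps.beams[-1][0] - (-9.0 / 10 ** 2)) < 1e-9  # the -10.0 entry was evicted
assert not hyps.is_done(-5.0, 10) and hyps.is_done(-20.0, 10)
```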
+ + +import unittest + +import numpy as np + +from transformers import is_flax_available +from transformers.testing_utils import require_flax + +from .test_modeling_flax_common import ids_tensor + + +if is_flax_available(): + import jax + import jax.numpy as jnp + from transformers.generation_flax_logits_process import ( + FlaxLogitsProcessorList, + FlaxTemperatureLogitsWarper, + FlaxTopKLogitsWarper, + FlaxTopPLogitsWarper, + ) + + +@require_flax +class LogitsProcessorTest(unittest.TestCase): + def _get_uniform_logits(self, batch_size: int, length: int): + scores = np.ones((batch_size, length)) / length + return scores + + def test_temperature_dist_warper(self): + input_ids = None + length = 20 + + scores = self._get_uniform_logits(batch_size=2, length=length) + + # tweak scores to not be uniform anymore + scores[1, 5] = (1 / length) + 0.1 # peak, 1st batch + scores[1, 10] = (1 / length) - 0.4 # valley, 1st batch + + # compute softmax + probs = jax.nn.softmax(scores, axis=-1) + + temp_dist_warper_sharper = FlaxTemperatureLogitsWarper(temperature=0.5) + temp_dist_warper_smoother = FlaxTemperatureLogitsWarper(temperature=1.3) + + warped_prob_sharp = jax.nn.softmax(temp_dist_warper_sharper(input_ids, scores.copy()), axis=-1) + warped_prob_smooth = jax.nn.softmax(temp_dist_warper_smoother(input_ids, scores.copy()), axis=-1) + + # uniform distribution stays uniform + self.assertTrue(jnp.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3)) + self.assertTrue(jnp.allclose(probs[0, :], warped_prob_smooth[0, :], atol=1e-3)) + + # sharp peaks get higher, valleys get lower + self.assertLess(probs[1, :].max(), warped_prob_sharp[1, :].max()) + self.assertGreater(probs[1, :].min(), warped_prob_sharp[1, :].min()) + + # smooth peaks get lower, valleys get higher + self.assertGreater(probs[1, :].max(), warped_prob_smooth[1, :].max()) + self.assertLess(probs[1, :].min(), warped_prob_smooth[1, :].min()) + + def test_top_k_dist_warper(self): + input_ids = None + vocab_size = 10 + batch_size = 2 + + # create ramp distribution + ramp_logits = np.broadcast_to(np.arange(vocab_size)[None, :], (batch_size, vocab_size)).copy() + ramp_logits[1:, : vocab_size // 2] = ramp_logits[1:, : vocab_size // 2] + vocab_size + + top_k_warp = FlaxTopKLogitsWarper(3) + + scores = top_k_warp(input_ids, ramp_logits) + + # check that correct tokens are filtered + self.assertListEqual(jnp.isinf(scores[0]).tolist(), 7 * [True] + 3 * [False]) + self.assertListEqual(jnp.isinf(scores[1]).tolist(), 2 * [True] + 3 * [False] + 5 * [True]) + + # check special case + length = 5 + top_k_warp_safety_check = FlaxTopKLogitsWarper(top_k=1, filter_value=0.0, min_tokens_to_keep=3) + + ramp_logits = np.broadcast_to(np.arange(length)[None, :], (batch_size, length)).copy() + scores = top_k_warp_safety_check(input_ids, ramp_logits) + + # min_tokens overwrites k: 3 tokens are kept => 2 tokens are nullified + self.assertListEqual((scores == 0.0).sum(axis=-1).tolist(), [2, 2]) + + def test_top_p_dist_warper(self): + input_ids = None + vocab_size = 10 + batch_size = 2 + + # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) + dist = np.log(np.array([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]])) + + top_p_warp = FlaxTopPLogitsWarper(0.7) + filtered_dist = np.exp(top_p_warp(input_ids, dist)) + + # dist should be filtered to keep min num values so that sum is >= 0.7 + # exp (-inf) => 0 + EXPECTED_FILTERED_DIST = np.array([[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]]) + self.assertTrue(np.allclose(filtered_dist, 
EXPECTED_FILTERED_DIST, atol=1e-3)) + + # check edge cases with negative and extreme logits + ramp_logits = np.broadcast_to(np.arange(vocab_size)[None, :], (batch_size, vocab_size)).copy() - ( + vocab_size // 2 + ) + + # make ramp_logits more extreme + ramp_logits[1] = ramp_logits[1] * 100.0 + + # make sure at least 2 tokens are kept + top_p_warp = FlaxTopPLogitsWarper(0.9, min_tokens_to_keep=2, filter_value=0.0) + filtered_dist = top_p_warp(input_ids, ramp_logits) + + # first batch should keep three tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. + self.assertListEqual((filtered_dist != 0.0).sum(axis=-1).tolist(), [3, 2]) + + def test_processor_list(self): + batch_size = 4 + sequence_length = 10 + vocab_size = 15 + + # dummy input_ids and scores + input_ids = ids_tensor((batch_size, sequence_length), vocab_size) + input_ids_comp = input_ids.copy() + + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_comp = scores.copy() + + # instantiate all dist processors + temp_dist_warp = FlaxTemperatureLogitsWarper(temperature=0.5) + top_k_warp = FlaxTopKLogitsWarper(3) + top_p_warp = FlaxTopPLogitsWarper(0.8) + + # no processor list + scores = temp_dist_warp(input_ids, scores) + scores = top_k_warp(input_ids, scores) + scores = top_p_warp(input_ids, scores) + + # with processor list + processor = FlaxLogitsProcessorList([temp_dist_warp, top_k_warp, top_p_warp]) + scores_comp = processor(input_ids, scores_comp) + + # scores should be equal + self.assertTrue(jnp.allclose(scores, scores_comp, atol=1e-3)) + + # input_ids should never be changed + self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist()) diff --git a/test_generation_flax_utils.py b/test_generation_flax_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b3e529c1859a492d6e93c729716adae08408bc6 --- /dev/null +++ b/test_generation_flax_utils.py @@ -0,0 +1,170 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
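The top-p expectations above all follow one rule: per row, keep the highest-probability tokens whose cumulative mass first reaches `top_p` and drop the rest. A small numpy sketch that reproduces the expected tensor from the test (the `min_tokens_to_keep` safety valve is deliberately omitted):

```python
import numpy as np

def top_p_filter(probs: np.ndarray, top_p: float) -> np.ndarray:
    filtered = np.zeros_like(probs)
    for row in range(probs.shape[0]):
        order = np.argsort(probs[row])[::-1]  # highest probability first
        cumulative = np.cumsum(probs[row][order])
        cutoff = int(np.searchsorted(cumulative, top_p)) + 1  # first index reaching top_p
        keep = order[:cutoff]
        filtered[row, keep] = probs[row, keep]
    return filtered

dist = np.array([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]])
expected = np.array([[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]])
assert np.allclose(top_p_filter(dist, 0.7), expected, atol=1e-6)
```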
+ +import random + +import numpy as np + +from transformers import is_flax_available +from transformers.testing_utils import require_flax + + +if is_flax_available(): + import os + + import jax + import jax.numpy as jnp + from jax import jit + + os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12" # assumed parallelism: 8 + + +def ids_tensor(shape, vocab_size, rng=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + output = np.array(values, dtype=jnp.int32).reshape(shape) + + return output + + +def random_attention_mask(shape, rng=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=rng) + # make sure that at least one token is attended to for each batch + attn_mask[:, -1] = 1 + return attn_mask + + +@require_flax +class FlaxGenerationTesterMixin: + model_tester = None + all_generative_model_classes = () + + def _get_input_ids_and_config(self): + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + + # cut to half length & take max batch_size 3 + max_batch_size = 2 + sequence_length = inputs["input_ids"].shape[-1] // 2 + input_ids = inputs["input_ids"][:max_batch_size, :sequence_length] + + attention_mask = jnp.ones_like(input_ids) + attention_mask = attention_mask[:max_batch_size, :sequence_length] + + # generate max 5 tokens + max_length = input_ids.shape[-1] + 5 + if config.eos_token_id is not None and config.pad_token_id is None: + # hack to allow generate for models such as GPT2 as is done in `generate()` + config.pad_token_id = config.eos_token_id + return config, input_ids, attention_mask, max_length + + def test_greedy_generate(self): + config, input_ids, _, max_length = self._get_input_ids_and_config() + config.do_sample = False + config.max_length = max_length + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids).sequences + self.assertEqual(generation_outputs.shape[-1], max_length) + + jit_generate = jit(model.generate) + jit_generation_outputs = jit_generate(input_ids).sequences + + self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) + + def test_sample_generate(self): + config, input_ids, _, max_length = self._get_input_ids_and_config() + config.do_sample = True + config.max_length = max_length + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids).sequences + self.assertEqual(generation_outputs.shape[-1], max_length) + + jit_generate = jit(model.generate) + jit_generation_outputs = jit_generate(input_ids).sequences + + self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) + + def test_sample_generate_logits_warper(self): + config, input_ids, _, max_length = self._get_input_ids_and_config() + config.do_sample = True + config.max_length = max_length + config.temperature = 0.8 + config.top_k = 10 + config.top_p = 0.3 + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids).sequences + self.assertEqual(generation_outputs.shape[-1], max_length) + + jit_generate = jit(model.generate) + jit_generation_outputs = jit_generate(input_ids).sequences + + self.assertListEqual(generation_outputs.tolist(), 
jit_generation_outputs.tolist()) + + def test_greedy_generate_attn_mask(self): + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # pad attention mask on the left + attention_mask = jax.ops.index_update(attention_mask, (0, 0), 0) + + config.do_sample = False + config.max_length = max_length + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids, attention_mask=attention_mask).sequences + self.assertEqual(generation_outputs.shape[-1], max_length) + + jit_generate = jit(model.generate) + jit_generation_outputs = jit_generate(input_ids, attention_mask=attention_mask).sequences + + self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) + + def test_sample_generate_attn_mask(self): + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # pad attention mask on the left + attention_mask = jax.ops.index_update(attention_mask, (0, 0), 0) + + config.do_sample = True + config.max_length = max_length + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + generation_outputs = model.generate(input_ids, attention_mask=attention_mask).sequences + self.assertEqual(generation_outputs.shape[-1], max_length) + + jit_generate = jit(model.generate) + jit_generation_outputs = jit_generate(input_ids, attention_mask=attention_mask).sequences + + self.assertListEqual(generation_outputs.tolist(), jit_generation_outputs.tolist()) diff --git a/test_generation_logits_process.py b/test_generation_logits_process.py new file mode 100644 index 0000000000000000000000000000000000000000..e07fd3066e2ed5fe9eee12334a2d8b6093107582 --- /dev/null +++ b/test_generation_logits_process.py @@ -0,0 +1,460 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
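The greedy-generation contract the Flax tests above check is simple: starting from the prompt, append the argmax of the model's next-token logits until `max_length` is reached, so outputs always have shape `(batch, max_length)`. A toy sketch with a deterministic stand-in for the model:

```python
import numpy as np

def toy_logits(input_ids: np.ndarray, vocab_size: int = 15) -> np.ndarray:
    # deterministic stand-in for a model: logits depend only on the last token
    return np.cos(np.outer(input_ids[:, -1] + 1, np.arange(vocab_size)))

def greedy_generate(input_ids: np.ndarray, max_length: int) -> np.ndarray:
    while input_ids.shape[-1] < max_length:
        next_tokens = toy_logits(input_ids).argmax(axis=-1)  # greedy pick
        input_ids = np.concatenate([input_ids, next_tokens[:, None]], axis=-1)
    return input_ids

prompt = np.array([[3, 1, 4], [1, 5, 9]])
sequences = greedy_generate(prompt, max_length=8)
assert sequences.shape == (2, 8)  # mirrors generation_outputs.shape[-1] == max_length
```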
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, torch_device + +from .test_modeling_common import ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers.generation_logits_process import ( + EncoderNoRepeatNGramLogitsProcessor, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + InfNanRemoveLogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + PrefixConstrainedLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, + ) + + +@require_torch +class LogitsProcessorTest(unittest.TestCase): + def _get_uniform_logits(self, batch_size: int, length: int): + scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length + return scores + + def test_min_length_dist_processor(self): + vocab_size = 20 + batch_size = 4 + eos_token_id = 0 + + min_dist_processor = MinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) + + # check that min length is applied at length 5 + input_ids = ids_tensor((batch_size, 5), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_before_min_length = min_dist_processor(input_ids, scores) + self.assertListEqual(scores_before_min_length[:, eos_token_id].tolist(), 4 * [-float("inf")]) + + # check that min length is not applied anymore at length 15 + input_ids = ids_tensor((batch_size, 15), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_before_min_length = min_dist_processor(input_ids, scores) + self.assertFalse(torch.isinf(scores_before_min_length).any()) + + def test_temperature_dist_warper(self): + input_ids = None + length = 20 + + scores = self._get_uniform_logits(batch_size=2, length=length) + + # tweak scores to not be uniform anymore + scores[1, 5] = (1 / length) + 0.1 # peak, 1st batch + scores[1, 10] = (1 / length) - 0.4 # valley, 1st batch + + # compute softmax + probs = nn.functional.softmax(scores, dim=-1) + + temp_dist_warper_sharper = TemperatureLogitsWarper(temperature=0.5) + temp_dist_warper_smoother = TemperatureLogitsWarper(temperature=1.3) + + warped_prob_sharp = nn.functional.softmax(temp_dist_warper_sharper(input_ids, scores.clone()), dim=-1) + warped_prob_smooth = nn.functional.softmax(temp_dist_warper_smoother(input_ids, scores.clone()), dim=-1) + + # uniform distribution stays uniform + self.assertTrue(torch.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3)) + self.assertTrue(torch.allclose(probs[0, :], warped_prob_smooth[0, :], atol=1e-3)) + + # sharp peaks get higher, valleys get lower + self.assertLess(probs[1, :].max(), warped_prob_sharp[1, :].max()) + self.assertGreater(probs[1, :].min(), warped_prob_sharp[1, :].min()) + + # smooth peaks get lower, valleys get higher + self.assertGreater(probs[1, :].max(), warped_prob_smooth[1, :].max()) + self.assertLess(probs[1, :].min(), warped_prob_smooth[1, :].min()) + + def test_repetition_penalty_dist_process(self): + input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long) + vocab_size = 10 + + scores = self._get_uniform_logits(batch_size=2, length=vocab_size) + + # give certain positions special values + scores[0, 0] = -(1 / vocab_size) + scores[1, 5] = 4 / vocab_size + + rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=2.0) + + scores =
rep_penalty_proc(input_ids, scores.clone()) + + # check that values were correctly changed + self.assertAlmostEqual(scores[0, 0].item(), -(1 / vocab_size) * 2) + self.assertAlmostEqual(scores[0, 1].item(), (1 / vocab_size) / 2) + + self.assertAlmostEqual(scores[1, 0].item(), (1 / vocab_size) / 2) + self.assertAlmostEqual(scores[1, 5].item(), (4 / vocab_size) / 2) + + def test_top_k_dist_warper(self): + input_ids = None + vocab_size = 10 + batch_size = 2 + + # create ramp distribution + ramp_logits = ( + torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat(batch_size, 1) + ) + ramp_logits[1:, : vocab_size // 2] = ramp_logits[1:, : vocab_size // 2] + vocab_size + + top_k_warp = TopKLogitsWarper(3) + + scores = top_k_warp(input_ids, ramp_logits) + + # check that correct tokens are filtered + self.assertListEqual(torch.isinf(scores[0]).tolist(), 7 * [True] + 3 * [False]) + self.assertListEqual(torch.isinf(scores[1]).tolist(), 2 * [True] + 3 * [False] + 5 * [True]) + + # check special cases + length = 5 + + logits = self._get_uniform_logits(batch_size=batch_size, length=length) + top_k_warp_safety_check = TopKLogitsWarper(top_k=1, filter_value=0.0, min_tokens_to_keep=3) + + scores = top_k_warp_safety_check(input_ids, logits) + # uniform dist is not changed + self.assertListEqual((scores == 0.0).to(torch.long).sum(dim=-1).tolist(), [0, 0]) + + ramp_logits = torch.arange(length, device=torch_device, dtype=torch.float).unsqueeze(0).repeat(batch_size, 1) + scores = top_k_warp_safety_check(input_ids, ramp_logits) + + # min_tokens overwrites k: 3 tokens are kept => 2 tokens are nullified + self.assertListEqual((scores == 0.0).to(torch.long).sum(dim=-1).tolist(), [2, 2]) + + def test_top_p_dist_warper(self): + input_ids = None + vocab_size = 10 + batch_size = 2 + + # create distribution and take log (inverse to Softmax as taken in TopPLogitsWarper) + dist = torch.log( + torch.tensor([[0.3, 0.1, 0.1, 0.5], [0.15, 0.3, 0.3, 0.25]], device=torch_device, dtype=torch.float) + ) + + top_p_warp = TopPLogitsWarper(0.7) + filtered_dist = torch.exp(top_p_warp(input_ids, dist)) + + # dist should be filtered to keep min num values so that sum is >= 0.7 + # exp (-inf) => 0 + EXPECTED_FILTERED_DIST = torch.tensor( + [[0.3, 0.0, 0.0, 0.5], [0.0, 0.3, 0.3, 0.25]], device=torch_device, dtype=torch.float + ) + self.assertTrue(torch.allclose(filtered_dist, EXPECTED_FILTERED_DIST, atol=1e-3)) + + # check edge cases with negative and extreme logits + ramp_logits = torch.arange(vocab_size, device=torch_device, dtype=torch.float).unsqueeze(0).repeat( + batch_size, 1 + ) - (vocab_size // 2) + + # make ramp_logits more extreme + ramp_logits[1] = ramp_logits[1] * 100.0 + + # make sure at least 2 tokens are kept + top_p_warp = TopPLogitsWarper(0.9, min_tokens_to_keep=2, filter_value=0.0) + filtered_dist = top_p_warp(input_ids, ramp_logits) + + # first batch should keep three tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. 
+ self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [3, 2]) + + def test_no_repeat_ngram_dist_processor(self): + vocab_size = 3 + batch_size = 2 + + input_ids = torch.tensor([[1, 1, 2, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_repeat_proc_2_gram = NoRepeatNGramLogitsProcessor(2) + no_repeat_proc_3_gram = NoRepeatNGramLogitsProcessor(3) + + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores.clone()) + + # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch + self.assertListEqual(torch.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [True, False, False]]) + + # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch + self.assertListEqual( + torch.isinf(filtered_scores_3_gram).tolist(), [[False, False, False], [True, False, False]] + ) + + def test_encoder_no_repeat_ngram_dist_processor(self): + vocab_size = 3 + num_beams = 2 + batch_size = 1 + + encoder_input_ids = torch.tensor([1, 2, 1, 1], device=torch_device, dtype=torch.long) + + input_ids = torch.tensor([[1, 2, 1], [8, 0, 2]], device=torch_device, dtype=torch.long) + scores = self._get_uniform_logits(batch_size * num_beams, vocab_size) + + no_repeat_proc_2_gram = EncoderNoRepeatNGramLogitsProcessor(2, encoder_input_ids=encoder_input_ids) + no_repeat_proc_3_gram = EncoderNoRepeatNGramLogitsProcessor(3, encoder_input_ids=encoder_input_ids) + + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores.clone()) + + # 2-gram would forbid 2nd and 3rd token (1,2) at 1st beam and 2nd token (1) at 2nd beam + self.assertListEqual(torch.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [False, True, False]]) + + # 3-gram would forbid 2nd token (1) at 1st beam and no token at 2nd beam + self.assertListEqual( + torch.isinf(filtered_scores_3_gram).tolist(), [[False, True, False], [False, False, False]] + ) + + # Batched input + vocab_size = 3 + num_beams = 2 + batch_size = 2 + encoder_input_ids = torch.tensor([[1, 2, 1, 1], [0, 0, 2, 1]], device=torch_device, dtype=torch.long) + + input_ids = torch.tensor([[1, 2, 1], [1, 0, 2], [0, 0, 0], [0, 2, 2]], device=torch_device, dtype=torch.long) + scores = self._get_uniform_logits(batch_size * num_beams, vocab_size) + + no_repeat_proc_2_gram = EncoderNoRepeatNGramLogitsProcessor(2, encoder_input_ids=encoder_input_ids) + no_repeat_proc_3_gram = EncoderNoRepeatNGramLogitsProcessor(3, encoder_input_ids=encoder_input_ids) + + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores.clone()) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores.clone()) + + # 2gram + # Batch 1 + # - Beam 1: tokens (1, 2) forbidden + # - Beam 2: tokens (1) forbidden + # Batch 2 + # - Beam 1: tokens (0, 2) forbidden + # - Beam 2: tokens (1) forbidden + self.assertListEqual( + torch.isinf(filtered_scores_2_gram).tolist(), + [[False, True, True], [False, True, False], [True, False, True], [False, True, False]], + ) + + # 3gram + # Batch 1 + # - Beam 1: tokens (1) forbidden + # - Beam 2: tokens () forbidden + # Batch 2 + # - Beam 1: tokens (2) forbidden + # - Beam 2: tokens () forbidden + self.assertListEqual( + torch.isinf(filtered_scores_3_gram).tolist(), + [[False, True, False], [False, False, False], [False, False, True], [False, False, False]], + ) + + def
test_no_bad_words_dist_processor(self): + vocab_size = 5 + batch_size = 2 + eos_token_id = 4 + + input_ids = torch.tensor([[0, 1, 3, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) + bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]] + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=bad_word_tokens, eos_token_id=eos_token_id) + + filtered_scores = no_bad_words_dist_proc(input_ids, scores.clone()) + + # batch 1: 1st, 2nd, and 4th (0, 1, 3) token are forbidden + # batch 2: 1st, 2nd, and 3rd (0, 1, 2) token are forbidden + # Note that 5th element cannot be forbidden as it is EOS token + self.assertListEqual( + torch.isinf(filtered_scores).tolist(), [[True, True, False, True, False], [True, True, True, False, False]] + ) + + # check edge case + no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=[[4]], eos_token_id=eos_token_id) + filtered_scores = no_bad_words_dist_proc(input_ids, scores.clone()) + self.assertTrue(torch.allclose(scores, filtered_scores, atol=1e-3)) + + def test_processor_list(self): + batch_size = 4 + sequence_length = 10 + vocab_size = 15 + eos_token_id = 0 + + # dummy input_ids and scores + input_ids = ids_tensor((batch_size, sequence_length), vocab_size) + input_ids_comp = input_ids.clone() + + scores = self._get_uniform_logits(batch_size, vocab_size) + scores_comp = scores.clone() + + # instantiate all dist processors + min_dist_proc = MinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) + temp_dist_warp = TemperatureLogitsWarper(temperature=0.5) + rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=2.0) + top_k_warp = TopKLogitsWarper(3) + top_p_warp = TopPLogitsWarper(0.8) + no_repeat_proc = NoRepeatNGramLogitsProcessor(2) + no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=[[1]], eos_token_id=eos_token_id) + + # no processor list + scores = min_dist_proc(input_ids, scores) + scores = temp_dist_warp(input_ids, scores) + scores = rep_penalty_proc(input_ids, scores) + scores = top_k_warp(input_ids, scores) + scores = top_p_warp(input_ids, scores) + scores = no_repeat_proc(input_ids, scores) + scores = no_bad_words_dist_proc(input_ids, scores) + + # with processor list + processor = LogitsProcessorList( + [ + min_dist_proc, + temp_dist_warp, + rep_penalty_proc, + top_k_warp, + top_p_warp, + no_repeat_proc, + no_bad_words_dist_proc, + ] + ) + scores_comp = processor(input_ids, scores_comp) + + # scores should be equal + self.assertTrue(torch.allclose(scores, scores_comp, atol=1e-3)) + + # input_ids should never be changed + self.assertListEqual(input_ids.tolist(), input_ids_comp.tolist()) + + def test_prefix_constrained_logits_processor(self): + vocab_size = 5 + batch_size = 2 + + input_ids = torch.tensor([[0, 1, 3, 1], [0, 1, 0, 1]], device=torch_device, dtype=torch.long) + scores = self._get_uniform_logits(batch_size, vocab_size) + + def prefix_allowed_tokens_fn(batch_id, inputs_ids): + return [[0, 1], [2, 3]][batch_id] + + prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(prefix_allowed_tokens_fn, 1) + + filtered_scores = prefix_constrained_logits_proc(input_ids, scores.clone()) + + # batch 1: 1st, 2nd (0, 1) token are allowed + # batch 2: 3rd, 4th (2, 3) token are allowed + self.assertListEqual( + torch.isinf(filtered_scores).tolist(), [[False, False, True, True, True], [True, True, False, False, True]] + ) + + def test_hamming_diversity(self): + vocab_size = 4 + num_beams = 2 + num_beam_groups = 2 + + 
scores = self._get_uniform_logits(num_beams, vocab_size) + # batch_idx = 0 -> index batch_idx * num_beam_groups -> idx = 0 * 2 = 0 -> penalises token 0 + # batch_idx = 1 -> index batch_idx * num_beam_groups -> idx = 1 * 2 = 2 -> penalises token 1 + current_tokens = torch.tensor([0, 3, 1, 2], device=torch_device, dtype=torch.long) + + diversity_logits_processor = HammingDiversityLogitsProcessor( + diversity_penalty=1.0, num_beams=num_beams, num_beam_groups=num_beam_groups + ) + + processed_scores = diversity_logits_processor(None, scores, current_tokens, 1) + + self.assertTrue( + torch.allclose( + processed_scores[0], torch.tensor([-0.7500, 0.2500, 0.2500, 0.2500], device=torch_device), atol=1e-3 + ) + ) + self.assertTrue( + torch.allclose( + processed_scores[1], torch.tensor([0.2500, -0.7500, 0.2500, 0.2500], device=torch_device), atol=1e-3 + ) + ) + + def test_forced_bos_token_logits_processor(self): + vocab_size = 20 + batch_size = 4 + bos_token_id = 0 + + logits_processor = ForcedBOSTokenLogitsProcessor(bos_token_id=bos_token_id) + + # check that all scores are -inf except the bos_token_id score + input_ids = ids_tensor((batch_size, 1), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores) + self.assertTrue(torch.isneginf(scores[:, bos_token_id + 1 :]).all()) + self.assertListEqual(scores[:, bos_token_id].tolist(), 4 * [0]) # score for bos_token_id should be zero + + # check that bos_token_id is not forced if current length is greater than 1 + input_ids = ids_tensor((batch_size, 4), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores) + self.assertFalse(torch.isinf(scores).any()) + + def test_forced_eos_token_logits_processor(self): + vocab_size = 20 + batch_size = 4 + eos_token_id = 0 + max_length = 5 + + logits_processor = ForcedEOSTokenLogitsProcessor(max_length=max_length, eos_token_id=eos_token_id) + + # check that all scores are -inf except the eos_token_id when max_length is reached + input_ids = ids_tensor((batch_size, 4), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores) + self.assertTrue(torch.isneginf(scores[:, eos_token_id + 1 :]).all()) + self.assertListEqual(scores[:, eos_token_id].tolist(), 4 * [0]) # score for eos_token_id should be zero + + # check that eos_token_id is not forced if max_length is not reached + input_ids = ids_tensor((batch_size, 3), vocab_size=20) + scores = self._get_uniform_logits(batch_size, vocab_size) + scores = logits_processor(input_ids, scores) + self.assertFalse(torch.isinf(scores).any()) + + def test_remove_nan_inf_logits_processor(self): + scores = torch.tensor( + [[0.0, 0.7, 0.8, float("nan")], [0.1, float("inf"), 0.3, float("-inf")]], device=torch_device + ) + input_ids = ids_tensor((2, 4), vocab_size=20) + + logits_processor = InfNanRemoveLogitsProcessor() + + scores = logits_processor(input_ids, scores) + + self.assertTrue( + torch.allclose( + scores, + torch.tensor( + [[0.0, 0.7, 0.8, 0.0], [0.1, torch.finfo(scores.dtype).max, 0.3, float("-inf")]], + device=torch_device, + ), + atol=1e-6, + ) + ) diff --git a/test_generation_stopping_criteria.py b/test_generation_stopping_criteria.py new file mode 100644 index 0000000000000000000000000000000000000000..d3de2c56da1d5d8ca5d1698b906364a9b44e1e8b --- /dev/null +++ b/test_generation_stopping_criteria.py @@ -0,0 +1,94 @@ +import time +import unittest + +from transformers import
is_torch_available +from transformers.testing_utils import require_torch, torch_device + +from .test_modeling_common import ids_tensor + + +if is_torch_available(): + import torch + + from transformers.generation_stopping_criteria import ( + MaxLengthCriteria, + MaxNewTokensCriteria, + MaxTimeCriteria, + StoppingCriteriaList, + validate_stopping_criteria, + ) + + +@require_torch +class StoppingCriteriaTestCase(unittest.TestCase): + def _get_tensors(self, length): + batch_size = 3 + vocab_size = 250 + + input_ids = ids_tensor((batch_size, length), vocab_size) + scores = torch.ones((batch_size, length), device=torch_device, dtype=torch.float) / length + return input_ids, scores + + def test_list_criteria(self): + input_ids, scores = self._get_tensors(5) + + criteria = StoppingCriteriaList( + [ + MaxLengthCriteria(max_length=10), + MaxTimeCriteria(max_time=0.1), + ] + ) + + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(9) + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(10) + self.assertTrue(criteria(input_ids, scores)) + + def test_max_length_criteria(self): + criteria = MaxLengthCriteria(max_length=10) + + input_ids, scores = self._get_tensors(5) + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(9) + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(10) + self.assertTrue(criteria(input_ids, scores)) + + def test_max_new_tokens_criteria(self): + criteria = MaxNewTokensCriteria(start_length=5, max_new_tokens=5) + + input_ids, scores = self._get_tensors(5) + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(9) + self.assertFalse(criteria(input_ids, scores)) + + input_ids, scores = self._get_tensors(10) + self.assertTrue(criteria(input_ids, scores)) + + criteria_list = StoppingCriteriaList([criteria]) + self.assertEqual(criteria_list.max_length, 10) + + def test_max_time_criteria(self): + input_ids, scores = self._get_tensors(5) + + criteria = MaxTimeCriteria(max_time=0.1) + self.assertFalse(criteria(input_ids, scores)) + + criteria = MaxTimeCriteria(max_time=0.1, initial_timestamp=time.time() - 0.2) + self.assertTrue(criteria(input_ids, scores)) + + def test_validate_stopping_criteria(self): + validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 10) + + with self.assertWarns(UserWarning): + validate_stopping_criteria(StoppingCriteriaList([MaxLengthCriteria(10)]), 11) + + stopping_criteria = validate_stopping_criteria(StoppingCriteriaList(), 11) + + self.assertEqual(len(stopping_criteria), 1) diff --git a/test_generation_utils.py b/test_generation_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..de986b696d8aa0aeddfe880993354754726411e1 --- /dev/null +++ b/test_generation_utils.py @@ -0,0 +1,1641 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
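The stopping-criteria tests above depend only on a small protocol: a criterion is a callable taking `(input_ids, scores)` and returning a bool, and `StoppingCriteriaList` stops as soon as any member fires. A minimal sketch of a custom criterion under that assumption (`MaxWallClockCriteria` is a hypothetical name, not part of transformers):

import time

import torch


class MaxWallClockCriteria:
    # Stop once max_time seconds have elapsed since initial_timestamp
    # (defaults to construction time), mirroring the behaviour that
    # test_max_time_criteria asserts for MaxTimeCriteria.
    def __init__(self, max_time: float, initial_timestamp: float = None):
        self.max_time = max_time
        self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> bool:
        return time.time() - self.initial_timestamp > self.max_time


# A timestamp 0.2s in the past trips a 0.1s budget immediately, as in the test.
criterion = MaxWallClockCriteria(max_time=0.1, initial_timestamp=time.time() - 0.2)
print(criterion(torch.ones((3, 5), dtype=torch.long), torch.ones((3, 5))))  # True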
+ + +import inspect +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + + +if is_torch_available(): + import torch + + from transformers import BartForConditionalGeneration, BartTokenizer, top_k_top_p_filtering + from transformers.generation_beam_search import BeamSearchScorer + from transformers.generation_logits_process import ( + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + InfNanRemoveLogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, + ) + from transformers.generation_stopping_criteria import MaxLengthCriteria, StoppingCriteriaList + from transformers.generation_utils import ( + BeamSampleDecoderOnlyOutput, + BeamSampleEncoderDecoderOutput, + BeamSearchDecoderOnlyOutput, + BeamSearchEncoderDecoderOutput, + GreedySearchDecoderOnlyOutput, + GreedySearchEncoderDecoderOutput, + SampleDecoderOnlyOutput, + SampleEncoderDecoderOutput, + ) + + +class GenerationTesterMixin: + model_tester = None + all_generative_model_classes = () + input_name = "input_ids" + + def _get_input_ids_and_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + input_ids = inputs_dict[self.input_name] + attention_mask = torch.ones_like(input_ids, dtype=torch.long) + + # cut to half length & take max batch_size 2 + max_batch_size = 2 + sequence_length = input_ids.shape[-1] // 2 + input_ids = input_ids[:max_batch_size, :sequence_length] + attention_mask = attention_mask[:max_batch_size, :sequence_length] + + # generate max 3 tokens + max_length = input_ids.shape[-1] + 3 + if config.eos_token_id is not None and config.pad_token_id is None: + # hack to allow generate for models such as GPT2 as is done in `generate()` + config.pad_token_id = config.eos_token_id + return config, input_ids, attention_mask, max_length + + @staticmethod + def _get_logits_processor_and_kwargs( + input_length, + eos_token_id, + forced_bos_token_id=None, + forced_eos_token_id=None, + max_length=None, + diversity_penalty=None, + ): + process_kwargs = { + "min_length": input_length + 1, + "bad_words_ids": [[1, 0]], + "no_repeat_ngram_size": 2, + "repetition_penalty": 1.2, + } + logits_processor = LogitsProcessorList( + ( + [ + HammingDiversityLogitsProcessor(diversity_penalty, num_beams=2, num_beam_groups=2), + ] + if diversity_penalty is not None + else [] + ) + + ( + [ + MinLengthLogitsProcessor(process_kwargs["min_length"], eos_token_id), + ] + if eos_token_id is not None + else [] + ) + + ( + [ + ForcedBOSTokenLogitsProcessor(forced_bos_token_id), + ] + if forced_bos_token_id is not None + else [] + ) + + ( + [ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)] + if forced_eos_token_id is not None + else [] + ) + + [ + NoBadWordsLogitsProcessor(process_kwargs["bad_words_ids"], eos_token_id), + NoRepeatNGramLogitsProcessor(process_kwargs["no_repeat_ngram_size"]), + RepetitionPenaltyLogitsProcessor(process_kwargs["repetition_penalty"]), + ] + ) + return process_kwargs, logits_processor + + @staticmethod + def _get_warper_and_kwargs(num_beams): + warp_kwargs = {"top_k": 10, "top_p": 0.7, "temperature": 0.7} + logits_warper = LogitsProcessorList( + [ + TemperatureLogitsWarper(warp_kwargs["temperature"]), + TopKLogitsWarper(top_k=warp_kwargs["top_k"], min_tokens_to_keep=(2
if num_beams > 1 else 1)), + TopPLogitsWarper(top_p=warp_kwargs["top_p"], min_tokens_to_keep=(2 if num_beams > 1 else 1)), + ] + ) + return warp_kwargs, logits_warper + + @staticmethod + def _get_beam_scorer_and_kwargs(batch_size, max_length, num_return_sequences=1): + beam_kwargs = { + "early_stopping": False, + "length_penalty": 2.0, + "num_beams": 2, + "num_return_sequences": num_return_sequences, + } + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=beam_kwargs["num_beams"], + device=torch_device, + length_penalty=beam_kwargs["length_penalty"], + do_early_stopping=beam_kwargs["early_stopping"], + num_beam_hyps_to_keep=num_return_sequences, + ) + return beam_kwargs, beam_scorer + + @staticmethod + def _get_diverse_beam_scorer_and_kwargs(batch_size, max_length, num_return_sequences=1): + beam_kwargs = { + "early_stopping": False, + "length_penalty": 2.0, + "num_beams": 2, + "num_return_sequences": num_return_sequences, + "num_beam_groups": 2, # one beam per group + "diversity_penalty": 2.0, + } + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=beam_kwargs["num_beams"], + device=torch_device, + length_penalty=beam_kwargs["length_penalty"], + do_early_stopping=beam_kwargs["early_stopping"], + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=beam_kwargs["num_beam_groups"], + ) + return beam_kwargs, beam_scorer + + @staticmethod + def _get_encoder_outputs( + model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 + ): + encoder = model.get_encoder() + encoder_outputs = encoder( + input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( + num_interleave, dim=0 + ) + input_ids = torch.zeros_like(input_ids[:, :1]) + model._get_decoder_start_token_id() + attention_mask = None + return encoder_outputs, input_ids, attention_mask + + def _greedy_generate( + self, + model, + input_ids, + attention_mask, + max_length, + output_scores=False, + output_attentions=False, + output_hidden_states=False, + return_dict_in_generate=False, + ): + if model.config.is_encoder_decoder: + max_length = 4 + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + eos_token_id=model.config.eos_token_id, + forced_bos_token_id=model.config.forced_bos_token_id, + forced_eos_token_id=model.config.forced_eos_token_id, + max_length=max_length, + ) + + kwargs = {} + + output_generate = model.generate( + input_ids, + attention_mask=attention_mask, + do_sample=False, + num_beams=1, + max_length=max_length, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + remove_invalid_values=True, + **logits_process_kwargs, + ) + + if model.config.is_encoder_decoder: + encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( + model, + input_ids, + attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + kwargs["encoder_outputs"] = encoder_outputs + + with torch.no_grad(): + output_greedy = model.greedy_search( + input_ids, + max_length=max_length, + attention_mask=attention_mask, + logits_processor=logits_processor, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_scores=output_scores, + 
return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + return output_greedy, output_generate + + def _sample_generate( + self, + model, + input_ids, + attention_mask, + max_length, + num_return_sequences, + logits_processor, + logits_warper, + logits_warper_kwargs, + process_kwargs, + output_scores=False, + output_attentions=False, + output_hidden_states=False, + return_dict_in_generate=False, + ): + torch.manual_seed(0) + output_generate = model.generate( + input_ids, + do_sample=True, + num_beams=1, + max_length=max_length, + num_return_sequences=num_return_sequences, + attention_mask=attention_mask, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + remove_invalid_values=True, + **logits_warper_kwargs, + **process_kwargs, + ) + + torch.manual_seed(0) + kwargs = {} + if model.config.is_encoder_decoder: + encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + model, + input_ids, + attention_mask, + num_interleave=num_return_sequences, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + kwargs["encoder_outputs"] = encoder_outputs + input_ids_clone = input_ids_clone.repeat_interleave(num_return_sequences, dim=0) + else: + attention_mask_clone = attention_mask.repeat_interleave(num_return_sequences, dim=0) + input_ids_clone = input_ids.repeat_interleave(num_return_sequences, dim=0) + + # prevent flaky generation test failures + logits_processor.append(InfNanRemoveLogitsProcessor()) + + with torch.no_grad(): + output_sample = model.sample( + input_ids_clone, + attention_mask=attention_mask_clone, + max_length=max_length, + logits_processor=logits_processor, + logits_warper=logits_warper, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + return output_sample, output_generate + + def _beam_search_generate( + self, + model, + input_ids, + attention_mask, + max_length, + beam_scorer, + beam_kwargs, + logits_processor, + logits_process_kwargs, + output_scores=False, + output_attentions=False, + output_hidden_states=False, + return_dict_in_generate=False, + ): + output_generate = model.generate( + input_ids, + attention_mask=attention_mask, + do_sample=False, + max_length=max_length, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + remove_invalid_values=True, + **beam_kwargs, + **logits_process_kwargs, + ) + + # beam_search does not automatically interleave `batch_size` dim for `num_beams` + kwargs = {} + if model.config.is_encoder_decoder: + encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + model, + input_ids, + attention_mask, + num_interleave=beam_scorer.num_beams, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + kwargs["encoder_outputs"] = encoder_outputs + input_ids_clone = input_ids_clone.repeat_interleave(beam_scorer.num_beams, dim=0) + else: + attention_mask_clone = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) + input_ids_clone = input_ids.repeat_interleave(beam_scorer.num_beams, dim=0) + + with torch.no_grad(): + output_beam_search = model.beam_search( + input_ids_clone, + beam_scorer, + max_length=max_length, + attention_mask=attention_mask_clone, + 
logits_processor=logits_processor, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + return output_generate, output_beam_search + + def _beam_sample_generate( + self, + model, + input_ids, + attention_mask, + max_length, + num_return_sequences, + beam_scorer, + beam_kwargs, + logits_warper, + logits_warper_kwargs, + output_scores=False, + output_attentions=False, + output_hidden_states=False, + return_dict_in_generate=False, + ): + torch.manual_seed(0) + output_generate = model.generate( + input_ids, + attention_mask=attention_mask, + do_sample=True, + max_length=max_length, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + remove_invalid_values=True, + **beam_kwargs, + **logits_warper_kwargs, + ) + # beam_search does not automatically interleave `batch_size` dim for `num_beams * num_return_sequences` + kwargs = {} + if model.config.is_encoder_decoder: + encoder_outputs, input_ids, attention_mask = self._get_encoder_outputs( + model, + input_ids, + attention_mask, + num_interleave=beam_scorer.num_beams * num_return_sequences, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + kwargs["encoder_outputs"] = encoder_outputs + else: + attention_mask = attention_mask.repeat_interleave(beam_scorer.num_beams * num_return_sequences, dim=0) + + # prevent flaky generation test failures + logits_processor = LogitsProcessorList() + logits_processor.append(InfNanRemoveLogitsProcessor()) + + torch.manual_seed(0) + with torch.no_grad(): + output_beam_sample = model.beam_sample( + input_ids.repeat_interleave(beam_scorer.num_beams * num_return_sequences, dim=0), + beam_scorer, + max_length=max_length, + attention_mask=attention_mask, + logits_warper=logits_warper, + logits_processor=logits_processor, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + + return output_generate, output_beam_sample + + def _group_beam_search_generate( + self, + model, + input_ids, + attention_mask, + max_length, + beam_scorer, + beam_kwargs, + logits_processor, + logits_process_kwargs, + output_scores=False, + output_attentions=False, + output_hidden_states=False, + return_dict_in_generate=False, + ): + output_generate = model.generate( + input_ids, + attention_mask=attention_mask, + do_sample=False, + max_length=max_length, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + remove_invalid_values=True, + **beam_kwargs, + **logits_process_kwargs, + ) + + # group_beam_search does not automatically interleave `batch_size` dim for `num_beams` + kwargs = {} + if model.config.is_encoder_decoder: + encoder_outputs, input_ids_clone, attention_mask_clone = self._get_encoder_outputs( + model, + input_ids, + attention_mask, + num_interleave=beam_scorer.num_beams, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + kwargs["encoder_outputs"] = encoder_outputs + input_ids_clone = input_ids_clone.repeat_interleave(beam_scorer.num_beams, dim=0) + else: + attention_mask_clone = attention_mask.repeat_interleave(beam_scorer.num_beams, dim=0) + input_ids_clone = 
input_ids.repeat_interleave(beam_scorer.num_beams, dim=0) + + with torch.no_grad(): + output_group_beam_search = model.group_beam_search( + input_ids_clone, + beam_scorer, + max_length=max_length, + attention_mask=attention_mask_clone, + logits_processor=logits_processor, + output_scores=output_scores, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + return output_generate, output_group_beam_search + + def test_greedy_generate(self): + # check `generate()` and `greedy_search()` are equal + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + # test old generation output for backwards compatibility + model = model_class(config).to(torch_device).eval() + output_greedy, output_generate = self._greedy_generate( + model=model, input_ids=input_ids, attention_mask=attention_mask, max_length=max_length + ) + self.assertListEqual(output_greedy.tolist(), output_generate.tolist()) + + def test_greedy_generate_dict_outputs(self): + for model_class in self.all_generative_model_classes: + # disable cache + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + config.use_cache = False + model = model_class(config).to(torch_device).eval() + output_greedy, output_generate = self._greedy_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + + if model.config.is_encoder_decoder: + self.assertIsInstance(output_greedy, GreedySearchEncoderDecoderOutput) + self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) + else: + self.assertIsInstance(output_greedy, GreedySearchDecoderOnlyOutput) + self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) + + self.assertListEqual(output_generate.sequences.tolist(), output_greedy.sequences.tolist()) + + for output in (output_greedy, output_generate): + self._check_outputs(output, input_ids, model.config) + + def test_greedy_generate_dict_outputs_use_cache(self): + for model_class in self.all_generative_model_classes: + # enable cache + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + if not hasattr(config, "use_cache"): + # only relevant if model has "use_cache" + return + + config.use_cache = True + config.is_decoder = True + model = model_class(config).to(torch_device).eval() + output_greedy, output_generate = self._greedy_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + + self.assertListEqual(output_generate.sequences.tolist(), output_greedy.sequences.tolist()) + + for output in (output_greedy, output_generate): + self._check_outputs(output, input_ids, model.config, use_cache=True) + + def test_sample_generate(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + model = model_class(config).to(torch_device).eval() + + if model.config.is_encoder_decoder: + max_length = 4 + + process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + model.config.eos_token_id, + forced_bos_token_id=model.config.forced_bos_token_id, + 
forced_eos_token_id=model.config.forced_eos_token_id, + max_length=max_length, + ) + logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) + + # check `generate()` and `sample()` are equal + output_sample, output_generate = self._sample_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + num_return_sequences=1, + logits_processor=logits_processor, + logits_warper=logits_warper, + logits_warper_kwargs=logits_warper_kwargs, + process_kwargs=process_kwargs, + ) + self.assertListEqual(output_sample.tolist(), output_generate.tolist()) + + # check `generate()` and `sample()` yield equal results for `num_return_sequences` + output_sample, output_generate = self._sample_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + num_return_sequences=3, + logits_processor=logits_processor, + logits_warper=logits_warper, + logits_warper_kwargs=logits_warper_kwargs, + process_kwargs=process_kwargs, + ) + self.assertListEqual(output_sample.tolist(), output_generate.tolist()) + + def test_sample_generate_dict_output(self): + for model_class in self.all_generative_model_classes: + # disable cache + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + config.use_cache = False + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + model.config.eos_token_id, + forced_bos_token_id=model.config.forced_bos_token_id, + forced_eos_token_id=model.config.forced_eos_token_id, + max_length=max_length, + ) + logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) + + output_sample, output_generate = self._sample_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + num_return_sequences=2, + logits_processor=logits_processor, + logits_warper=logits_warper, + logits_warper_kwargs=logits_warper_kwargs, + process_kwargs=process_kwargs, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + + if model.config.is_encoder_decoder: + self.assertIsInstance(output_sample, SampleEncoderDecoderOutput) + self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_sample, SampleDecoderOnlyOutput) + self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) + + self.assertListEqual(output_generate.sequences.tolist(), output_sample.sequences.tolist()) + + for output in (output_sample, output_generate): + self._check_outputs(output, input_ids, model.config, num_return_sequences=2) + + def test_beam_search_generate(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + config.eos_token_id, + config.forced_bos_token_id,
+ config.forced_eos_token_id, + max_length, + ) + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) + + # check `generate()` and `beam_search()` are equal + output_generate, output_beam_search = self._beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_process_kwargs=logits_process_kwargs, + logits_processor=logits_processor, + ) + self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) + + # check `generate()` and `beam_search()` are equal for `num_return_sequences` + num_return_sequences = 2 + if model.config.is_encoder_decoder: + max_length = 4 + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs( + input_ids.shape[0], max_length, num_return_sequences=num_return_sequences + ) + + output_generate, output_beam_search = self._beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_process_kwargs=logits_process_kwargs, + logits_processor=logits_processor, + ) + self.assertListEqual(output_generate.tolist(), output_beam_search.tolist()) + + def test_beam_search_generate_dict_output(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # disable cache + config.use_cache = False + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + config.eos_token_id, + config.forced_bos_token_id, + config.forced_eos_token_id, + max_length, + ) + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) + output_generate, output_beam_search = self._beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_process_kwargs=logits_process_kwargs, + logits_processor=logits_processor, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_search, BeamSearchEncoderDecoderOutput) + self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) + else: + self.assertIsInstance(output_beam_search, BeamSearchDecoderOnlyOutput) + self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) + + self.assertListEqual(output_generate.sequences.tolist(), output_beam_search.sequences.tolist()) + self.assertTrue( + torch.allclose(output_generate["sequences_scores"], output_beam_search["sequences_scores"], atol=1e-3) + ) + self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) + self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) + + for output in (output_beam_search, output_generate): + self._check_outputs(output, input_ids, model.config,
num_return_sequences=beam_scorer.num_beams) + + def test_beam_search_generate_dict_outputs_use_cache(self): + for model_class in self.all_generative_model_classes: + # enable cache + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + if not hasattr(config, "use_cache"): + # only relevant if model has "use_cache" + return + + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + config.eos_token_id, + config.forced_bos_token_id, + config.forced_eos_token_id, + max_length, + ) + + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs(input_ids.shape[0], max_length) + + config.use_cache = True + config.is_decoder = True + model = model_class(config).to(torch_device).eval() + output_beam, output_generate = self._beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_process_kwargs=logits_process_kwargs, + logits_processor=logits_processor, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + + self.assertListEqual(output_generate.sequences.tolist(), output_beam.sequences.tolist()) + + for output in (output_beam, output_generate): + self._check_outputs( + output, input_ids, model.config, use_cache=True, num_return_sequences=beam_scorer.num_beams + ) + + def test_beam_sample_generate(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) + + model = model_class(config).to(torch_device).eval() + + # check `generate()` and `beam_sample()` are equal + # change `num_return_sequences = 2` but not for `beam_scorer` + num_return_sequences = 2 + if model.config.is_encoder_decoder: + max_length = 4 + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs( + input_ids.shape[0] * num_return_sequences, max_length + ) + beam_kwargs["num_return_sequences"] = num_return_sequences + + output_generate, output_beam_sample = self._beam_sample_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + num_return_sequences=num_return_sequences, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_warper=logits_warper, + logits_warper_kwargs=logits_warper_kwargs, + ) + self.assertListEqual(output_generate.tolist(), output_beam_sample.tolist()) + + def test_beam_sample_generate_dict_output(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # disable cache +
config.use_cache = False + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + model = model_class(config).to(torch_device).eval() + logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) + + num_return_sequences = 2 + if model.config.is_encoder_decoder: + max_length = 4 + beam_kwargs, beam_scorer = self._get_beam_scorer_and_kwargs( + input_ids.shape[0] * num_return_sequences, max_length + ) + beam_kwargs["num_return_sequences"] = num_return_sequences + + output_beam_sample, output_generate = self._beam_sample_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + num_return_sequences=num_return_sequences, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_warper=logits_warper, + logits_warper_kwargs=logits_warper_kwargs, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + + if model.config.is_encoder_decoder: + self.assertIsInstance(output_beam_sample, BeamSampleEncoderDecoderOutput) + self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) + else: + self.assertIsInstance(output_beam_sample, BeamSampleDecoderOnlyOutput) + self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) + + self.assertListEqual(output_generate.sequences.tolist(), output_beam_sample.sequences.tolist()) + self.assertTrue( + torch.allclose(output_generate["sequences_scores"], output_beam_sample["sequences_scores"], atol=1e-3) + ) + self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) + self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) + + for output in (output_beam_sample, output_generate): + self._check_outputs( + output, input_ids, model.config, num_return_sequences=num_return_sequences * beam_scorer.num_beams + ) + + def test_generate_without_input_ids(self): + config, _, _, max_length = self._get_input_ids_and_config() + + # if no bos token id => cannot generate from None + if config.bos_token_id is None: + return + + for model_class in self.all_generative_model_classes: + model = model_class(config).to(torch_device) + model.eval() + + output_ids_generate = model.generate( + do_sample=False, + max_length=max_length, + remove_invalid_values=True, + ) + + self.assertIsNotNone(output_ids_generate) + + def test_group_beam_search_generate(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + config.eos_token_id, + config.forced_bos_token_id, + config.forced_eos_token_id, + max_length, + diversity_penalty=2.0, + ) + + # check `generate()` and `group_beam_search()` are equal +
beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs(input_ids.shape[0], max_length) + output_generate, output_group_beam_search = self._group_beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_processor=logits_processor, + logits_process_kwargs=logits_process_kwargs, + ) + self.assertListEqual(output_generate.tolist(), output_group_beam_search.tolist()) + + # check `generate()` and `group_beam_search()` are equal for `num_return_sequences` + num_return_sequences = 2 + if model.config.is_encoder_decoder: + max_length = 4 + beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs( + input_ids.shape[0], max_length, num_return_sequences=num_return_sequences + ) + output_generate, output_group_beam_search = self._group_beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_processor=logits_processor, + logits_process_kwargs=logits_process_kwargs, + ) + self.assertListEqual(output_generate.tolist(), output_group_beam_search.tolist()) + + def test_group_beam_search_generate_dict_output(self): + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + config.use_cache = False + + # It is important to set the eos_token_id to None to ensure that no sequences + # shorter than `max_length` can be generated which could lead to flaky circle ci + # failures if the top `num_return_sequences` beams are all shorter than the longest beam + config.eos_token_id = None + config.forced_eos_token_id = None + + model = model_class(config).to(torch_device).eval() + if model.config.is_encoder_decoder: + max_length = 4 + + logits_process_kwargs, logits_processor = self._get_logits_processor_and_kwargs( + input_ids.shape[-1], + config.eos_token_id, + config.forced_bos_token_id, + config.forced_eos_token_id, + max_length, + diversity_penalty=2.0, + ) + + num_return_sequences = 1 + beam_kwargs, beam_scorer = self._get_diverse_beam_scorer_and_kwargs( + input_ids.shape[0], max_length, num_return_sequences=num_return_sequences + ) + output_generate, output_group_beam_search = self._group_beam_search_generate( + model=model, + input_ids=input_ids, + attention_mask=attention_mask, + max_length=max_length, + beam_scorer=beam_scorer, + beam_kwargs=beam_kwargs, + logits_processor=logits_processor, + logits_process_kwargs=logits_process_kwargs, + output_scores=True, + output_hidden_states=True, + output_attentions=True, + return_dict_in_generate=True, + ) + if model.config.is_encoder_decoder: + self.assertIsInstance(output_group_beam_search, BeamSearchEncoderDecoderOutput) + self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) + else: + self.assertIsInstance(output_group_beam_search, BeamSearchDecoderOnlyOutput) + self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) + + self.assertListEqual(output_generate.sequences.tolist(), output_group_beam_search.sequences.tolist()) + self.assertTrue( + torch.allclose( + output_generate["sequences_scores"], output_group_beam_search["sequences_scores"], atol=1e-3 + ) + ) + self.assertTrue(output_generate["sequences_scores"].shape == (output_generate["sequences"].shape[0],)) + self.assertTrue((output_generate["sequences_scores"] < 0).all().item()) + + for output in (output_group_beam_search,
output_generate): + self._check_outputs( + output, input_ids, model.config, num_return_sequences=num_return_sequences * beam_scorer.num_beams + ) + + def test_generate_with_head_masking(self): + """Test designed for encoder-decoder models to ensure the attention head masking is used.""" + attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] + for model_class in self.all_generative_model_classes: + config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + model = model_class(config).to(torch_device) + # We want to test only encoder-decoder models + if not config.is_encoder_decoder: + continue + + head_masking = { + "head_mask": torch.zeros(config.encoder_layers, config.encoder_attention_heads, device=torch_device), + "decoder_head_mask": torch.zeros( + config.decoder_layers, config.decoder_attention_heads, device=torch_device + ), + "cross_attn_head_mask": torch.zeros( + config.decoder_layers, config.decoder_attention_heads, device=torch_device + ), + } + + signature = inspect.signature(model.forward) + # We want to test only models where encoder/decoder head masking is implemented + if not set(head_masking.keys()) < set([*signature.parameters.keys()]): + continue + + for attn_name, (name, mask) in zip(attention_names, head_masking.items()): + out = model.generate( + input_ids, + attention_mask=attention_mask, + num_beams=1, + output_attentions=True, + return_dict_in_generate=True, + remove_invalid_values=True, + **{name: mask}, + ) + # We check the state of decoder_attentions and cross_attentions just from the last step + attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] + self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) + + def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + batch_size, seq_length = input_ids.shape + num_sequences_in_output = batch_size * num_return_sequences + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # Attentions + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # with use_cache the first step's outputs equal the no-cache case, so skip that step here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # with use_cache the first step's outputs equal the no-cache case, so skip that step here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] +
min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + def _check_scores(self, batch_size, scores, length, config): + expected_shape = (batch_size, config.vocab_size) + self.assertIsInstance(scores, tuple) + self.assertEqual(len(scores), length) + self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length + idx if not use_cache else 1 + src_len = min_length + idx + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) + ) + + def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length): + encoder_expected_shape = (batch_size, config.num_attention_heads, seq_length, seq_length) + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [layer_attentions.shape for layer_attentions in attentions], + [encoder_expected_shape] * len(attentions), + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length + idx if not use_cache else 1 + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length): + encoder_expected_shape = (batch_size, seq_length, config.hidden_size) + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in hidden_states], + [encoder_expected_shape] * len(hidden_states), + ) + + +@require_torch +class UtilsFunctionsTest(unittest.TestCase): + + # tests whether the top_k_top_p function behaves as expected + def test_top_k_top_p_filtering(self): + logits = torch.tensor( + [ + [ + 8.2220991, # 3rd highest value; idx. 0 + -0.5620044, + 5.23229752, + 4.0386393, + -6.8798378, + -0.54785802, + -3.2012153, + 2.92777176, + 1.88171953, + 7.35341276, + 8.43207833, # 2nd highest value; idx. 10 + -9.85711836, + -5.96209236, + -1.13039161, + -7.1115294, + -0.8369633, + -5.3186408, + 7.06427407, + 0.81369344, + -0.82023817, + -5.9179796, + 0.58813443, + -6.99778438, + 4.71551189, + -0.18771637, + 7.44020759, # 4th highest value; idx. 
25 + 9.38450987, # 1st highest value; idx. 26 + 2.12662941, + -9.32562038, + 2.35652522, + ], # cumulative prob of 4 highest values <= 0.6 + [ + 0.58425518, + 4.53139238, + -5.57510464, + -6.28030699, + -7.19529503, + -4.02122551, + 1.39337037, + -6.06707057, + 1.59480517, + -9.643119, + 0.03907799, + 0.67231762, + -8.88206726, + 6.27115922, # 4th highest value; idx. 13 + 2.28520723, + 4.82767506, + 4.30421368, + 8.8275313, # 2nd highest value; idx. 17 + 5.44029958, + -4.4735794, + 7.38579536, # 3rd highest value; idx. 20 + -2.91051663, + 2.61946077, + -2.5674762, + -9.48959302, + -4.02922645, + -1.35416918, + 9.67702323, # 1st highest value; idx. 27 + -5.89478553, + 1.85370467, + ], # cumulative prob of 4 highest values <= 0.6 + ], + dtype=torch.float, + device=torch_device, + ) + + non_inf_expected_idx = torch.tensor( + [[0, 0], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 20], [1, 27]], + dtype=torch.long, + device=torch_device, + ) # expected non-filtered idx as noted above + + non_inf_expected_output = torch.tensor( + [ + 8.2221, + 8.4321, + 7.4402, + 9.3845, + 6.2712, + 8.8275, + 7.3858, + 9.6770, + ], # expected non-filtered values as noted above + dtype=torch.float, + device=torch_device, + ) + + output = top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) + non_inf_output = output[output != -float("inf")].to(device=torch_device) + non_inf_idx = (output != -float("inf")).nonzero().to(device=torch_device) + + self.assertTrue(torch.allclose(non_inf_expected_output, non_inf_output, atol=1e-12)) + self.assertTrue(torch.all(torch.eq(non_inf_expected_idx, non_inf_idx))) + + +@require_torch +class GenerationIntegrationTests(unittest.TestCase): + @slow + def test_diverse_beam_search(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood. + The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People. + "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports. + The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both.""" + + bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + outputs = bart_model.generate( + input_ids, + num_beams=4, + num_return_sequences=2, + num_beam_groups=4, + diversity_penalty=2.0, + remove_invalid_values=True, + ) + + generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "The couple announced the birth of their son, Silas Randall Timberlake, in a statement. Silas was the middle name of Timberlake's maternal grandfather Bill Bomar. Randall is the musician's own middle name, as well as his father's first. It is the first baby for both of them.", + "Justin Timberlake and Jessica Biel have a son. The baby is named Silas Randall Timberlake. It is the first child for both. The couple announced the pregnancy in January. The name Silas is the middle name of Timberlake's maternal grandfather. 
It's also his own middle name.", + ], + ) + + def test_max_length_backward_compat_greedy(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + max_length = 20 + input_ids = input_ids.expand(2, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + input_ids = bart_model._prepare_decoder_input_ids_for_generation( + input_ids, + decoder_start_token_id=bart_model.config.decoder_start_token_id, + bos_token_id=bart_model.config.bos_token_id, + ) + + with self.assertWarns(UserWarning): + bart_model.greedy_search( + input_ids, + max_length=max_length, + pad_token_id=bart_model.config.pad_token_id, + eos_token_id=bart_model.config.eos_token_id, + **model_kwargs, + ) + + def test_max_length_backward_compat_sample(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + max_length = 20 + input_ids = input_ids.expand(2, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + input_ids = bart_model._prepare_decoder_input_ids_for_generation( + input_ids, + decoder_start_token_id=bart_model.config.decoder_start_token_id, + bos_token_id=bart_model.config.bos_token_id, + ) + with torch.no_grad(): + with self.assertWarns(UserWarning): + bart_model.sample( + input_ids, + max_length=max_length, + pad_token_id=bart_model.config.pad_token_id, + eos_token_id=bart_model.config.eos_token_id, + **model_kwargs, + ) + + def test_max_length_backward_compat_beam_search(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + batch_size = 1 + max_length = 20 + num_beams = 2 + + input_ids = input_ids.expand(2, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + input_ids = bart_model._prepare_decoder_input_ids_for_generation( + input_ids, + decoder_start_token_id=bart_model.config.decoder_start_token_id, + bos_token_id=bart_model.config.bos_token_id, + ) + + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + ) + with self.assertWarns(UserWarning): + _ = bart_model.beam_search( + input_ids, num_beams=num_beams, max_length=max_length, beam_scorer=beam_scorer, **model_kwargs + ) + + def test_max_length_backward_compat_group_beam_search(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + batch_size = 1 + max_length = 20 + num_beams = 6 + 
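# (note: group beam search runs the 6 beams as 3 independent groups of 2; the grouping is configured on the BeamSearchScorer below) + 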
num_beam_groups = 3 + num_return_sequences = num_beams * batch_size + + input_ids = input_ids.expand(6, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + input_ids = bart_model._prepare_decoder_input_ids_for_generation( + input_ids, + decoder_start_token_id=bart_model.config.decoder_start_token_id, + bos_token_id=bart_model.config.bos_token_id, + ) + + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=num_beam_groups, + ) + with self.assertWarns(UserWarning): + bart_model.group_beam_search( + input_ids, diverse_beam_scorer, num_beams=num_beams, max_length=max_length, **model_kwargs + ) + + def test_max_length_warning_if_different(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + batch_size = 1 + + max_length = 20 + num_beams = 6 + num_beam_groups = 3 + num_return_sequences = num_beams * batch_size + stopping_criteria_max_length = 18 + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=stopping_criteria_max_length)]) + + # Greedy + input_ids = input_ids.expand(6, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + input_ids = bart_model._prepare_decoder_input_ids_for_generation( + input_ids, + decoder_start_token_id=bart_model.config.decoder_start_token_id, + bos_token_id=bart_model.config.bos_token_id, + ) + + with self.assertWarns(UserWarning): + bart_model.greedy_search( + input_ids, + max_length=max_length, + pad_token_id=bart_model.config.pad_token_id, + stopping_criteria=stopping_criteria, + eos_token_id=bart_model.config.eos_token_id, + **model_kwargs, + ) + + # Sample + with self.assertWarns(UserWarning): + with torch.no_grad(): + bart_model.sample( + input_ids, + max_length=max_length, + stopping_criteria=stopping_criteria, + pad_token_id=bart_model.config.pad_token_id, + eos_token_id=bart_model.config.eos_token_id, + **model_kwargs, + ) + + # Beam + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + ) + with self.assertWarns(UserWarning): + with torch.no_grad(): + bart_model.beam_search( + input_ids, + num_beams=num_beams, + stopping_criteria=stopping_criteria, + max_length=max_length, + beam_scorer=beam_scorer, + **model_kwargs, + ) + + # Grouped beam search + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=num_beam_groups, + ) + with self.assertWarns(UserWarning): + bart_model.group_beam_search( + input_ids, + diverse_beam_scorer, + stopping_criteria=stopping_criteria, + num_beams=num_beams, + max_length=max_length, + **model_kwargs, + ) + + def test_beam_search_warning_if_max_length_is_passed(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + + batch_size = 1 + num_beams = 3 + + input_ids = bart_tokenizer(article, 
return_tensors="pt").input_ids.to(torch_device) + input_ids = input_ids.expand(num_beams, -1) + model_kwargs = bart_model._prepare_encoder_decoder_kwargs_for_generation(input_ids, {}) + + stopping_criteria_max_length = 18 + stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=stopping_criteria_max_length)]) + + with self.assertWarns(UserWarning): + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + max_length=10, + ) + + generated_ids = bart_model.beam_search( + input_ids, + num_beams=num_beams, + stopping_criteria=stopping_criteria, + beam_scorer=beam_scorer, + **model_kwargs, + ) + + beam_scorer_no_max_len = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=torch_device, + ) + + generated_ids_no_max_len = bart_model.beam_search( + input_ids, + num_beams=num_beams, + stopping_criteria=stopping_criteria, + beam_scorer=beam_scorer_no_max_len, + **model_kwargs, + ) + + # BeamSearchScorer max_length should not influence "real" max_length + self.assertEqual(generated_ids.tolist(), generated_ids_no_max_len.tolist()) + + def test_max_new_tokens(self): + article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" + bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") + bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random").to(torch_device) + input_ids = bart_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + self.assertEqual(list(input_ids.shape), [1, 15]) + + # Encoder decoder call + max_new_tokens = 3 + outputs = bart_model.generate(input_ids, max_new_tokens=max_new_tokens) + # 1 BOS + 3 new tokens + self.assertEqual(list(outputs.shape), [1, 4]) + + # Decoder only call + outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=max_new_tokens) + # 15 + 3 new tokens + self.assertEqual(list(outputs.shape), [1, 18]) + + # max_new_tokens and max_length serve the same purpose and should not be used together. + with self.assertWarns(UserWarning): + outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=10, max_length=20) diff --git a/test_hf_api.py b/test_hf_api.py new file mode 100644 index 0000000000000000000000000000000000000000..8b7b1ddc868ab77e1dbdd4e6c01404277a29715d --- /dev/null +++ b/test_hf_api.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
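+ +# These tests exercise the HfApi client against the staging Hub endpoint; in rough, +# hedged outline, the workflow they cover is: +# +# api = HfApi(endpoint=ENDPOINT_STAGING) +# token = api.login(username=USER, password=PASS) +# api.create_repo(token=token, name="my-repo") +# api.delete_repo(token=token, name="my-repo")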
+ + +import os +import shutil +import subprocess +import time +import unittest + +from requests.exceptions import HTTPError +from transformers.hf_api import HfApi, HfFolder, ModelInfo, RepoObj +from transformers.testing_utils import ENDPOINT_STAGING, PASS, USER, is_staging_test, require_git_lfs + + +ENDPOINT_STAGING_BASIC_AUTH = f"https://{USER}:{PASS}@moon-staging.huggingface.co" + +REPO_NAME = f"my-model-{int(time.time())}" +REPO_NAME_LARGE_FILE = f"my-model-largefiles-{int(time.time())}" +WORKING_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/working_repo") +LARGE_FILE_14MB = "https://cdn-media.huggingface.co/lfs-largefiles/progit.epub" +LARGE_FILE_18MB = "https://cdn-media.huggingface.co/lfs-largefiles/progit.pdf" + + +class HfApiCommonTest(unittest.TestCase): + _api = HfApi(endpoint=ENDPOINT_STAGING) + + +class HfApiLoginTest(HfApiCommonTest): + def test_login_invalid(self): + with self.assertRaises(HTTPError): + self._api.login(username=USER, password="fake") + + def test_login_valid(self): + token = self._api.login(username=USER, password=PASS) + self.assertIsInstance(token, str) + + +class HfApiEndpointsTest(HfApiCommonTest): + @classmethod + def setUpClass(cls): + """ + Share this valid token in all tests below. + """ + cls._token = cls._api.login(username=USER, password=PASS) + + def test_whoami(self): + user, orgs = self._api.whoami(token=self._token) + self.assertEqual(user, USER) + self.assertIsInstance(orgs, list) + + def test_list_repos_objs(self): + objs = self._api.list_repos_objs(token=self._token) + self.assertIsInstance(objs, list) + if len(objs) > 0: + o = objs[-1] + self.assertIsInstance(o, RepoObj) + + def test_create_and_delete_repo(self): + self._api.create_repo(token=self._token, name=REPO_NAME) + self._api.delete_repo(token=self._token, name=REPO_NAME) + + +class HfApiPublicTest(unittest.TestCase): + def test_staging_model_list(self): + _api = HfApi(endpoint=ENDPOINT_STAGING) + _ = _api.model_list() + + def test_model_list(self): + _api = HfApi() + models = _api.model_list() + self.assertGreater(len(models), 100) + self.assertIsInstance(models[0], ModelInfo) + + +class HfFolderTest(unittest.TestCase): + def test_token_workflow(self): + """ + Test the whole token save/get/delete workflow, + with the desired behavior with respect to non-existent tokens. + """ + token = f"token-{int(time.time())}" + HfFolder.save_token(token) + self.assertEqual(HfFolder.get_token(), token) + HfFolder.delete_token() + HfFolder.delete_token() + # ^^ not an error, we test that the + # second call does not fail. + self.assertEqual(HfFolder.get_token(), None) + + +@require_git_lfs +@is_staging_test +class HfLargefilesTest(HfApiCommonTest): + @classmethod + def setUpClass(cls): + """ + Share this valid token in all tests below. 
+ """ + cls._token = cls._api.login(username=USER, password=PASS) + + def setUp(self): + try: + shutil.rmtree(WORKING_REPO_DIR) + except FileNotFoundError: + pass + + def tearDown(self): + self._api.delete_repo(token=self._token, name=REPO_NAME_LARGE_FILE) + + def setup_local_clone(self, REMOTE_URL): + REMOTE_URL_AUTH = REMOTE_URL.replace(ENDPOINT_STAGING, ENDPOINT_STAGING_BASIC_AUTH) + subprocess.run(["git", "clone", REMOTE_URL_AUTH, WORKING_REPO_DIR], check=True, capture_output=True) + subprocess.run(["git", "lfs", "track", "*.pdf"], check=True, cwd=WORKING_REPO_DIR) + subprocess.run(["git", "lfs", "track", "*.epub"], check=True, cwd=WORKING_REPO_DIR) + + def test_end_to_end_thresh_6M(self): + REMOTE_URL = self._api.create_repo( + token=self._token, name=REPO_NAME_LARGE_FILE, lfsmultipartthresh=6 * 10 ** 6 + ) + self.setup_local_clone(REMOTE_URL) + + subprocess.run(["wget", LARGE_FILE_18MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR) + subprocess.run(["git", "add", "*"], check=True, cwd=WORKING_REPO_DIR) + subprocess.run(["git", "commit", "-m", "commit message"], check=True, cwd=WORKING_REPO_DIR) + + # This will fail as we haven't set up our custom transfer agent yet. + failed_process = subprocess.run(["git", "push"], capture_output=True, cwd=WORKING_REPO_DIR) + self.assertEqual(failed_process.returncode, 1) + self.assertIn("transformers-cli lfs-enable-largefiles", failed_process.stderr.decode()) + # ^ Instructions on how to fix this are included in the error message. + + subprocess.run(["transformers-cli", "lfs-enable-largefiles", WORKING_REPO_DIR], check=True) + + start_time = time.time() + subprocess.run(["git", "push"], check=True, cwd=WORKING_REPO_DIR) + print("took", time.time() - start_time) + + # To be 100% sure, let's download the resolved file + pdf_url = f"{REMOTE_URL}/resolve/main/progit.pdf" + DEST_FILENAME = "uploaded.pdf" + subprocess.run(["wget", pdf_url, "-O", DEST_FILENAME], check=True, capture_output=True, cwd=WORKING_REPO_DIR) + dest_filesize = os.stat(os.path.join(WORKING_REPO_DIR, DEST_FILENAME)).st_size + self.assertEqual(dest_filesize, 18685041) + + def test_end_to_end_thresh_16M(self): + # Here we'll push one multipart and one non-multipart file in the same commit, and see what happens + REMOTE_URL = self._api.create_repo( + token=self._token, name=REPO_NAME_LARGE_FILE, lfsmultipartthresh=16 * 10 ** 6 + ) + self.setup_local_clone(REMOTE_URL) + + subprocess.run(["wget", LARGE_FILE_18MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR) + subprocess.run(["wget", LARGE_FILE_14MB], check=True, capture_output=True, cwd=WORKING_REPO_DIR) + subprocess.run(["git", "add", "*"], check=True, cwd=WORKING_REPO_DIR) + subprocess.run(["git", "commit", "-m", "both files in same commit"], check=True, cwd=WORKING_REPO_DIR) + + subprocess.run(["transformers-cli", "lfs-enable-largefiles", WORKING_REPO_DIR], check=True) + + start_time = time.time() + subprocess.run(["git", "push"], check=True, cwd=WORKING_REPO_DIR) + print("took", time.time() - start_time) diff --git a/test_hf_argparser.py b/test_hf_argparser.py new file mode 100644 index 0000000000000000000000000000000000000000..787990b866595be75a97cb9cb708925516f59667 --- /dev/null +++ b/test_hf_argparser.py @@ -0,0 +1,224 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import unittest +from argparse import Namespace +from dataclasses import dataclass, field +from enum import Enum +from typing import List, Optional + +from transformers import HfArgumentParser, TrainingArguments +from transformers.hf_argparser import string_to_bool + + +def list_field(default=None, metadata=None): + return field(default_factory=lambda: default, metadata=metadata) + + +@dataclass +class BasicExample: + foo: int + bar: float + baz: str + flag: bool + + +@dataclass +class WithDefaultExample: + foo: int = 42 + baz: str = field(default="toto", metadata={"help": "help message"}) + + +@dataclass +class WithDefaultBoolExample: + foo: bool = False + baz: bool = True + opt: Optional[bool] = None + + +class BasicEnum(Enum): + titi = "titi" + toto = "toto" + + +@dataclass +class EnumExample: + foo: BasicEnum = "toto" + + def __post_init__(self): + self.foo = BasicEnum(self.foo) + + +@dataclass +class OptionalExample: + foo: Optional[int] = None + bar: Optional[float] = field(default=None, metadata={"help": "help message"}) + baz: Optional[str] = None + ces: Optional[List[str]] = list_field(default=[]) + des: Optional[List[int]] = list_field(default=[]) + + +@dataclass +class ListExample: + foo_int: List[int] = list_field(default=[]) + bar_int: List[int] = list_field(default=[1, 2, 3]) + foo_str: List[str] = list_field(default=["Hallo", "Bonjour", "Hello"]) + foo_float: List[float] = list_field(default=[0.1, 0.2, 0.3]) + + +@dataclass +class RequiredExample: + required_list: List[int] = field() + required_str: str = field() + required_enum: BasicEnum = field() + + def __post_init__(self): + self.required_enum = BasicEnum(self.required_enum) + + +class HfArgumentParserTest(unittest.TestCase): + def argparsersEqual(self, a: argparse.ArgumentParser, b: argparse.ArgumentParser) -> bool: + """ + Small helper to check pseudo-equality of parsed arguments on `ArgumentParser` instances. 
+ """ + self.assertEqual(len(a._actions), len(b._actions)) + for x, y in zip(a._actions, b._actions): + xx = {k: v for k, v in vars(x).items() if k != "container"} + yy = {k: v for k, v in vars(y).items() if k != "container"} + self.assertEqual(xx, yy) + + def test_basic(self): + parser = HfArgumentParser(BasicExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo", type=int, required=True) + expected.add_argument("--bar", type=float, required=True) + expected.add_argument("--baz", type=str, required=True) + expected.add_argument("--flag", type=string_to_bool, default=True, const=True, nargs="?") + self.argparsersEqual(parser, expected) + + def test_with_default(self): + parser = HfArgumentParser(WithDefaultExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo", default=42, type=int) + expected.add_argument("--baz", default="toto", type=str, help="help message") + self.argparsersEqual(parser, expected) + + def test_with_default_bool(self): + parser = HfArgumentParser(WithDefaultBoolExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?") + expected.add_argument("--no_baz", action="store_false", dest="baz") + expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?") + expected.add_argument("--opt", type=string_to_bool, default=None) + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=False, baz=True, opt=None)) + + args = parser.parse_args(["--foo", "--no_baz"]) + self.assertEqual(args, Namespace(foo=True, baz=False, opt=None)) + + args = parser.parse_args(["--foo", "--baz"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=None)) + + args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"]) + self.assertEqual(args, Namespace(foo=True, baz=True, opt=True)) + + args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"]) + self.assertEqual(args, Namespace(foo=False, baz=False, opt=False)) + + def test_with_enum(self): + parser = HfArgumentParser(EnumExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo", default="toto", choices=["titi", "toto"], type=str) + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual(args.foo, "toto") + enum_ex = parser.parse_args_into_dataclasses([])[0] + self.assertEqual(enum_ex.foo, BasicEnum.toto) + + args = parser.parse_args(["--foo", "titi"]) + self.assertEqual(args.foo, "titi") + enum_ex = parser.parse_args_into_dataclasses(["--foo", "titi"])[0] + self.assertEqual(enum_ex.foo, BasicEnum.titi) + + def test_with_list(self): + parser = HfArgumentParser(ListExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo_int", nargs="+", default=[], type=int) + expected.add_argument("--bar_int", nargs="+", default=[1, 2, 3], type=int) + expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str) + expected.add_argument("--foo_float", nargs="+", default=[0.1, 0.2, 0.3], type=float) + + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual( + args, + Namespace(foo_int=[], bar_int=[1, 2, 3], foo_str=["Hallo", "Bonjour", "Hello"], foo_float=[0.1, 0.2, 0.3]), + ) + + args = parser.parse_args("--foo_int 1 --bar_int 2 3 --foo_str a b c --foo_float 0.1 0.7".split()) + self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", 
"b", "c"], foo_float=[0.1, 0.7])) + + def test_with_optional(self): + parser = HfArgumentParser(OptionalExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--foo", default=None, type=int) + expected.add_argument("--bar", default=None, type=float, help="help message") + expected.add_argument("--baz", default=None, type=str) + expected.add_argument("--ces", nargs="+", default=[], type=str) + expected.add_argument("--des", nargs="+", default=[], type=int) + self.argparsersEqual(parser, expected) + + args = parser.parse_args([]) + self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[])) + + args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split()) + self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3])) + + def test_with_required(self): + parser = HfArgumentParser(RequiredExample) + + expected = argparse.ArgumentParser() + expected.add_argument("--required_list", nargs="+", type=int, required=True) + expected.add_argument("--required_str", type=str, required=True) + expected.add_argument("--required_enum", type=str, choices=["titi", "toto"], required=True) + self.argparsersEqual(parser, expected) + + def test_parse_dict(self): + parser = HfArgumentParser(BasicExample) + + args_dict = { + "foo": 12, + "bar": 3.14, + "baz": "42", + "flag": True, + } + + parsed_args = parser.parse_dict(args_dict)[0] + args = BasicExample(**args_dict) + self.assertEqual(parsed_args, args) + + def test_integration_training_args(self): + parser = HfArgumentParser(TrainingArguments) + self.assertIsNotNone(parser) diff --git a/test_image_utils.py b/test_image_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..584cf3f2518d2adb0f013bdaea1d800ded870a82 --- /dev/null +++ b/test_image_utils.py @@ -0,0 +1,369 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL.Image + + from transformers import ImageFeatureExtractionMixin + + +def get_random_image(height, width): + random_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + return PIL.Image.fromarray(random_array) + + +@require_vision +class ImageFeatureExtractionTester(unittest.TestCase): + def test_conversion_image_to_array(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + + # Conversion with defaults (rescale + channel first) + array1 = feature_extractor.to_numpy_array(image) + self.assertTrue(array1.dtype, np.float32) + self.assertEqual(array1.shape, (3, 16, 32)) + + # Conversion with rescale and not channel first + array2 = feature_extractor.to_numpy_array(image, channel_first=False) + self.assertTrue(array2.dtype, np.float32) + self.assertEqual(array2.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array1, array2.transpose(2, 0, 1))) + + # Conversion with no rescale and channel first + array3 = feature_extractor.to_numpy_array(image, rescale=False) + self.assertTrue(array3.dtype, np.uint8) + self.assertEqual(array3.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array1, array3.astype(np.float32) / 255.0)) + + # Conversion with no rescale and not channel first + array4 = feature_extractor.to_numpy_array(image, rescale=False, channel_first=False) + self.assertTrue(array4.dtype, np.uint8) + self.assertEqual(array4.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array2, array4.astype(np.float32) / 255.0)) + + def test_conversion_array_to_array(self): + feature_extractor = ImageFeatureExtractionMixin() + array = np.random.randint(0, 256, (16, 32, 3), dtype=np.uint8) + + # By default, rescale (for an array of ints) and channel permute + array1 = feature_extractor.to_numpy_array(array) + self.assertTrue(array1.dtype, np.float32) + self.assertEqual(array1.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) / 255.0)) + + # Same with no permute + array2 = feature_extractor.to_numpy_array(array, channel_first=False) + self.assertTrue(array2.dtype, np.float32) + self.assertEqual(array2.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array2, array.astype(np.float32) / 255.0)) + + # Force rescale to False + array3 = feature_extractor.to_numpy_array(array, rescale=False) + self.assertTrue(array3.dtype, np.uint8) + self.assertEqual(array3.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array3, array.transpose(2, 0, 1))) + + # Force rescale to False and no channel permute + array4 = feature_extractor.to_numpy_array(array, rescale=False, channel_first=False) + self.assertTrue(array4.dtype, np.uint8) + self.assertEqual(array4.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array4, array)) + + # Now test the default rescale for a float array (defaults to False) + array5 = feature_extractor.to_numpy_array(array2) + self.assertTrue(array5.dtype, np.float32) + self.assertEqual(array5.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array5, array1)) + + @require_torch + def test_conversion_torch_to_array(self): + feature_extractor = ImageFeatureExtractionMixin() + tensor = torch.randint(0, 256, (16, 32, 3)) + array = tensor.numpy() + + # By default, rescale (for a tensor of ints) and channel permute + 
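# (assumed behavior: to_numpy_array converts torch tensors to NumPy via .numpy() first, then applies the same rescale/permute logic as for arrays) + 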
array1 = feature_extractor.to_numpy_array(tensor) + self.assertTrue(array1.dtype, np.float32) + self.assertEqual(array1.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array1, array.transpose(2, 0, 1).astype(np.float32) / 255.0)) + + # Same with no permute + array2 = feature_extractor.to_numpy_array(tensor, channel_first=False) + self.assertTrue(array2.dtype, np.float32) + self.assertEqual(array2.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array2, array.astype(np.float32) / 255.0)) + + # Force rescale to False + array3 = feature_extractor.to_numpy_array(tensor, rescale=False) + self.assertTrue(array3.dtype, np.int64) + self.assertEqual(array3.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array3, array.transpose(2, 0, 1))) + + # Force rescale to False and no channel permute + array4 = feature_extractor.to_numpy_array(tensor, rescale=False, channel_first=False) + self.assertTrue(array4.dtype, np.int64) + self.assertEqual(array4.shape, (16, 32, 3)) + self.assertTrue(np.array_equal(array4, array)) + + # Now test the default rescale for a float tensor (defaults to False) + array5 = feature_extractor.to_numpy_array(tensor.float() / 255.0) + self.assertTrue(array5.dtype, np.float32) + self.assertEqual(array5.shape, (3, 16, 32)) + self.assertTrue(np.array_equal(array5, array1)) + + def test_conversion_image_to_image(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + + # On a PIL Image, `to_pil_image` is a no-op. + image1 = feature_extractor.to_pil_image(image) + self.assertTrue(isinstance(image1, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image), np.array(image1))) + + def test_conversion_array_to_image(self): + feature_extractor = ImageFeatureExtractionMixin() + array = np.random.randint(0, 256, (16, 32, 3), dtype=np.uint8) + + # By default, no rescale (for an array of ints) + image1 = feature_extractor.to_pil_image(array) + self.assertTrue(isinstance(image1, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image1), array)) + + # If the array is channel-first, proper reordering of the channels is done. + image2 = feature_extractor.to_pil_image(array.transpose(2, 0, 1)) + self.assertTrue(isinstance(image2, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image2), array)) + + # If the array has floating type, it's rescaled by default. + image3 = feature_extractor.to_pil_image(array.astype(np.float32) / 255.0) + self.assertTrue(isinstance(image3, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image3), array)) + + # The default rescaling behavior can be overridden. + image4 = feature_extractor.to_pil_image(array.astype(np.float32), rescale=False) + self.assertTrue(isinstance(image4, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image4), array)) + + # And with floats + channel first. + image5 = feature_extractor.to_pil_image(array.transpose(2, 0, 1).astype(np.float32) / 255.0) + self.assertTrue(isinstance(image5, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image5), array)) + + @require_torch + def test_conversion_tensor_to_image(self): + feature_extractor = ImageFeatureExtractionMixin() + tensor = torch.randint(0, 256, (16, 32, 3)) + array = tensor.numpy() + + # By default, no rescale (for a tensor of ints) + image1 = feature_extractor.to_pil_image(tensor) + self.assertTrue(isinstance(image1, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image1), array)) + + # If the tensor is channel-first, proper reordering of the channels is done. 
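+ # (channel-first means shape (num_channels, height, width); to_pil_image is expected to move channels back to the last axis before building the PIL image)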
+ image2 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1)) + self.assertTrue(isinstance(image2, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image2), array)) + + # If the tensor has floating type, it's rescaled by default. + image3 = feature_extractor.to_pil_image(tensor.float() / 255.0) + self.assertTrue(isinstance(image3, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image3), array)) + + # The default rescaling behavior can be overridden. + image4 = feature_extractor.to_pil_image(tensor.float(), rescale=False) + self.assertTrue(isinstance(image4, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image4), array)) + + # And with floats + channel first. + image5 = feature_extractor.to_pil_image(tensor.permute(2, 0, 1).float() / 255.0) + self.assertTrue(isinstance(image5, PIL.Image.Image)) + self.assertTrue(np.array_equal(np.array(image5), array)) + + def test_resize_image_and_array(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + array = np.array(image) + + # Size can be an int or a tuple of ints. + resized_image = feature_extractor.resize(image, 8) + self.assertTrue(isinstance(resized_image, PIL.Image.Image)) + self.assertEqual(resized_image.size, (8, 8)) + + resized_image1 = feature_extractor.resize(image, (8, 16)) + self.assertTrue(isinstance(resized_image1, PIL.Image.Image)) + self.assertEqual(resized_image1.size, (8, 16)) + + # Passing an array converts it to a PIL Image. + resized_image2 = feature_extractor.resize(array, 8) + self.assertTrue(isinstance(resized_image2, PIL.Image.Image)) + self.assertEqual(resized_image2.size, (8, 8)) + self.assertTrue(np.array_equal(np.array(resized_image), np.array(resized_image2))) + + resized_image3 = feature_extractor.resize(array, (8, 16)) + self.assertTrue(isinstance(resized_image3, PIL.Image.Image)) + self.assertEqual(resized_image3.size, (8, 16)) + self.assertTrue(np.array_equal(np.array(resized_image1), np.array(resized_image3))) + + @require_torch + def test_resize_tensor(self): + feature_extractor = ImageFeatureExtractionMixin() + tensor = torch.randint(0, 256, (16, 32, 3)) + array = tensor.numpy() + + # Size can be an int or a tuple of ints. + resized_image = feature_extractor.resize(tensor, 8) + self.assertTrue(isinstance(resized_image, PIL.Image.Image)) + self.assertEqual(resized_image.size, (8, 8)) + + resized_image1 = feature_extractor.resize(tensor, (8, 16)) + self.assertTrue(isinstance(resized_image1, PIL.Image.Image)) + self.assertEqual(resized_image1.size, (8, 16)) + + # Check we get the same results as with NumPy arrays. + resized_image2 = feature_extractor.resize(array, 8) + self.assertTrue(np.array_equal(np.array(resized_image), np.array(resized_image2))) + + resized_image3 = feature_extractor.resize(array, (8, 16)) + self.assertTrue(np.array_equal(np.array(resized_image1), np.array(resized_image3))) + + def test_normalize_image(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + array = np.array(image) + mean = [0.1, 0.5, 0.9] + std = [0.2, 0.4, 0.6] + + # PIL Images are converted to NumPy arrays for the normalization + normalized_image = feature_extractor.normalize(image, mean, std) + self.assertTrue(isinstance(normalized_image, np.ndarray)) + self.assertEqual(normalized_image.shape, (3, 16, 32)) + + # During the conversion, rescaling and channel-first reordering are applied. 
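+ # (so the expected value below replays those two steps by hand before normalizing)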
+ expected = array.transpose(2, 0, 1).astype(np.float32) / 255.0 + np_mean = np.array(mean).astype(np.float32)[:, None, None] + np_std = np.array(std).astype(np.float32)[:, None, None] + expected = (expected - np_mean) / np_std + self.assertTrue(np.array_equal(normalized_image, expected)) + + def test_normalize_array(self): + feature_extractor = ImageFeatureExtractionMixin() + array = np.random.random((16, 32, 3)) + mean = [0.1, 0.5, 0.9] + std = [0.2, 0.4, 0.6] + + # mean and std can be passed as lists or NumPy arrays. + expected = (array - np.array(mean)) / np.array(std) + normalized_array = feature_extractor.normalize(array, mean, std) + self.assertTrue(np.array_equal(normalized_array, expected)) + + normalized_array = feature_extractor.normalize(array, np.array(mean), np.array(std)) + self.assertTrue(np.array_equal(normalized_array, expected)) + + # Normalize will detect automatically if channel first or channel last is used. + array = np.random.random((3, 16, 32)) + expected = (array - np.array(mean)[:, None, None]) / np.array(std)[:, None, None] + normalized_array = feature_extractor.normalize(array, mean, std) + self.assertTrue(np.array_equal(normalized_array, expected)) + + normalized_array = feature_extractor.normalize(array, np.array(mean), np.array(std)) + self.assertTrue(np.array_equal(normalized_array, expected)) + + @require_torch + def test_normalize_tensor(self): + feature_extractor = ImageFeatureExtractionMixin() + tensor = torch.rand(16, 32, 3) + mean = [0.1, 0.5, 0.9] + std = [0.2, 0.4, 0.6] + + # mean and std can be passed as lists or tensors. + expected = (tensor - torch.tensor(mean)) / torch.tensor(std) + normalized_tensor = feature_extractor.normalize(tensor, mean, std) + self.assertTrue(torch.equal(normalized_tensor, expected)) + + normalized_tensor = feature_extractor.normalize(tensor, torch.tensor(mean), torch.tensor(std)) + self.assertTrue(torch.equal(normalized_tensor, expected)) + + # Normalize will detect automatically if channel first or channel last is used. + tensor = torch.rand(3, 16, 32) + expected = (tensor - torch.tensor(mean)[:, None, None]) / torch.tensor(std)[:, None, None] + normalized_tensor = feature_extractor.normalize(tensor, mean, std) + self.assertTrue(torch.equal(normalized_tensor, expected)) + + normalized_tensor = feature_extractor.normalize(tensor, torch.tensor(mean), torch.tensor(std)) + self.assertTrue(torch.equal(normalized_tensor, expected)) + + def test_center_crop_image(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + + # Test various crop sizes: bigger on all dimensions, on one of the dimensions only and on both dimensions. + crop_sizes = [8, (8, 64), 20, (32, 64)] + for size in crop_sizes: + cropped_image = feature_extractor.center_crop(image, size) + self.assertTrue(isinstance(cropped_image, PIL.Image.Image)) + + # PIL Image.size is transposed compared to NumPy or PyTorch (width first instead of height first). + expected_size = (size, size) if isinstance(size, int) else (size[1], size[0]) + self.assertEqual(cropped_image.size, expected_size) + + def test_center_crop_array(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + array = feature_extractor.to_numpy_array(image) + + # Test various crop sizes: bigger on all dimensions, on one of the dimensions only and on both dimensions. 
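+ # (the test image is 16x32, so 8 is smaller on both axes, 20 and (8, 64) exceed one axis, and (32, 64) exceeds both)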
+ crop_sizes = [8, (8, 64), 20, (32, 64)] + for size in crop_sizes: + cropped_array = feature_extractor.center_crop(array, size) + self.assertTrue(isinstance(cropped_array, np.ndarray)) + + expected_size = (size, size) if isinstance(size, int) else size + self.assertEqual(cropped_array.shape[-2:], expected_size) + + # Check result is consistent with PIL.Image.crop + cropped_image = feature_extractor.center_crop(image, size) + self.assertTrue(np.array_equal(cropped_array, feature_extractor.to_numpy_array(cropped_image))) + + @require_torch + def test_center_crop_tensor(self): + feature_extractor = ImageFeatureExtractionMixin() + image = get_random_image(16, 32) + array = feature_extractor.to_numpy_array(image) + tensor = torch.tensor(array) + + # Test various crop sizes: bigger on all dimensions, on one of the dimensions only and on both dimensions. + crop_sizes = [8, (8, 64), 20, (32, 64)] + for size in crop_sizes: + cropped_tensor = feature_extractor.center_crop(tensor, size) + self.assertTrue(isinstance(cropped_tensor, torch.Tensor)) + + expected_size = (size, size) if isinstance(size, int) else size + self.assertEqual(cropped_tensor.shape[-2:], expected_size) + + # Check result is consistent with PIL.Image.crop + cropped_image = feature_extractor.center_crop(image, size) + self.assertTrue(torch.equal(cropped_tensor, torch.tensor(feature_extractor.to_numpy_array(cropped_image)))) diff --git a/test_logging.py b/test_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..d0633bfbe41717bf7ee9fdc6ec398757300736c5 --- /dev/null +++ b/test_logging.py @@ -0,0 +1,105 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
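+ +# These tests cover the transformers logging verbosity helpers; in rough outline, the +# API exercised below is: +# +# from transformers import logging +# logging.set_verbosity_error() # silence warnings for every transformers.* logger +# level = logging.get_verbosity() # read the effective level back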
+ +import os +import unittest + +import transformers.models.bart.tokenization_bart +from transformers import logging +from transformers.testing_utils import CaptureLogger, mockenv + + +class HfLoggingTest(unittest.TestCase): + def test_set_level(self): + logger = logging.get_logger() + + # the current default level is logging.WARNING + level_origin = logging.get_verbosity() + + logging.set_verbosity_error() + self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity()) + + logging.set_verbosity_warning() + self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity()) + + logging.set_verbosity_info() + self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity()) + + logging.set_verbosity_debug() + self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity()) + + # restore to the original level + logging.set_verbosity(level_origin) + + def test_integration(self): + level_origin = logging.get_verbosity() + + logger = logging.get_logger("transformers.models.bart.tokenization_bart") + msg = "Testing 1, 2, 3" + + # should be able to log warnings (if default settings weren't overridden by `pytest --log-level-all`) + if level_origin <= logging.WARNING: + with CaptureLogger(logger) as cl: + logger.warning(msg) + self.assertEqual(cl.out, msg + "\n") + + # this is setting the level for all of `transformers.*` loggers + logging.set_verbosity_error() + + # should not be able to log warnings + with CaptureLogger(logger) as cl: + logger.warning(msg) + self.assertEqual(cl.out, "") + + # should be able to log warnings again + logging.set_verbosity_warning() + with CaptureLogger(logger) as cl: + logger.warning(msg) + self.assertEqual(cl.out, msg + "\n") + + # restore to the original level + logging.set_verbosity(level_origin) + + @mockenv(TRANSFORMERS_VERBOSITY="error") + def test_env_override(self): + # reset for the env var to take effect, next time some logger call is made + transformers.utils.logging._reset_library_root_logger() + # this action activates the env var + _ = logging.get_logger("transformers.models.bart.tokenization_bart") + + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) + env_level = logging.log_levels[env_level_str] + + current_level = logging.get_verbosity() + self.assertEqual( + env_level, + current_level, + f"TRANSFORMERS_VERBOSITY={env_level_str}/{env_level}, but internal verbosity is {current_level}", + ) + + # restore to the original level + os.environ["TRANSFORMERS_VERBOSITY"] = "" + transformers.utils.logging._reset_library_root_logger() + + @mockenv(TRANSFORMERS_VERBOSITY="super-error") + def test_env_invalid_override(self): + # reset for the env var to take effect, next time some logger call is made + transformers.utils.logging._reset_library_root_logger() + logger = logging.logging.getLogger() + with CaptureLogger(logger) as cl: + # this action activates the env var + logging.get_logger("transformers.models.bart.tokenization_bart") + self.assertIn("Unknown option TRANSFORMERS_VERBOSITY=super-error", cl.out) + + # no need to restore as nothing was changed diff --git a/test_model_card.py b/test_model_card.py new file mode 100644 index 0000000000000000000000000000000000000000..1004642a92a2a6253da5cc91a05ac5c3545ffed9 --- /dev/null +++ b/test_model_card.py @@ -0,0 +1,81 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import tempfile +import unittest + +from transformers.modelcard import ModelCard + + +class ModelCardTester(unittest.TestCase): + def setUp(self): + self.inputs_dict = { + "model_details": { + "Organization": "testing", + "Model date": "today", + "Model version": "v2.1, Developed by Test Corp in 2019.", + "Architecture": "Convolutional Neural Network.", + }, + "metrics": "BLEU and ROUGE-1", + "evaluation_data": { + "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1"}, + "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "training_data": { + "Dataset": "English Wikipedia dump dated 2018-12-01", + "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76}, + } + + def test_model_card_common_properties(self): + modelcard = ModelCard.from_dict(self.inputs_dict) + self.assertTrue(hasattr(modelcard, "model_details")) + self.assertTrue(hasattr(modelcard, "intended_use")) + self.assertTrue(hasattr(modelcard, "factors")) + self.assertTrue(hasattr(modelcard, "metrics")) + self.assertTrue(hasattr(modelcard, "evaluation_data")) + self.assertTrue(hasattr(modelcard, "training_data")) + self.assertTrue(hasattr(modelcard, "quantitative_analyses")) + self.assertTrue(hasattr(modelcard, "ethical_considerations")) + self.assertTrue(hasattr(modelcard, "caveats_and_recommendations")) + + def test_model_card_to_json_string(self): + modelcard = ModelCard.from_dict(self.inputs_dict) + obj = json.loads(modelcard.to_json_string()) + for key, value in self.inputs_dict.items(): + self.assertEqual(obj[key], value) + + def test_model_card_to_json_file(self): + model_card_first = ModelCard.from_dict(self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filename = os.path.join(tmpdirname, "modelcard.json") + model_card_first.to_json_file(filename) + model_card_second = ModelCard.from_json_file(filename) + + self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + + def test_model_card_from_and_save_pretrained(self): + model_card_first = ModelCard.from_dict(self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + model_card_first.save_pretrained(tmpdirname) + model_card_second = ModelCard.from_pretrained(tmpdirname) + + self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) diff --git a/test_model_output.py b/test_model_output.py new file mode 100644 index 0000000000000000000000000000000000000000..a5160566e64a4fdf3437bbf7a7615dcf658830c7 --- /dev/null +++ b/test_model_output.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2020 The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from dataclasses import dataclass +from typing import Optional + +from transformers.file_utils import ModelOutput + + +@dataclass +class ModelOutputTest(ModelOutput): + a: float + b: Optional[float] = None + c: Optional[float] = None + + +class ModelOutputTester(unittest.TestCase): + def test_get_attributes(self): + x = ModelOutputTest(a=30) + self.assertEqual(x.a, 30) + self.assertIsNone(x.b) + self.assertIsNone(x.c) + with self.assertRaises(AttributeError): + _ = x.d + + def test_index_with_ints_and_slices(self): + x = ModelOutputTest(a=30, b=10) + self.assertEqual(x[0], 30) + self.assertEqual(x[1], 10) + self.assertEqual(x[:2], (30, 10)) + self.assertEqual(x[:], (30, 10)) + + x = ModelOutputTest(a=30, c=10) + self.assertEqual(x[0], 30) + self.assertEqual(x[1], 10) + self.assertEqual(x[:2], (30, 10)) + self.assertEqual(x[:], (30, 10)) + + def test_index_with_strings(self): + x = ModelOutputTest(a=30, b=10) + self.assertEqual(x["a"], 30) + self.assertEqual(x["b"], 10) + with self.assertRaises(KeyError): + _ = x["c"] + + x = ModelOutputTest(a=30, c=10) + self.assertEqual(x["a"], 30) + self.assertEqual(x["c"], 10) + with self.assertRaises(KeyError): + _ = x["b"] + + def test_dict_like_properties(self): + x = ModelOutputTest(a=30) + self.assertEqual(list(x.keys()), ["a"]) + self.assertEqual(list(x.values()), [30]) + self.assertEqual(list(x.items()), [("a", 30)]) + self.assertEqual(list(x), ["a"]) + + x = ModelOutputTest(a=30, b=10) + self.assertEqual(list(x.keys()), ["a", "b"]) + self.assertEqual(list(x.values()), [30, 10]) + self.assertEqual(list(x.items()), [("a", 30), ("b", 10)]) + self.assertEqual(list(x), ["a", "b"]) + + x = ModelOutputTest(a=30, c=10) + self.assertEqual(list(x.keys()), ["a", "c"]) + self.assertEqual(list(x.values()), [30, 10]) + self.assertEqual(list(x.items()), [("a", 30), ("c", 10)]) + self.assertEqual(list(x), ["a", "c"]) + + with self.assertRaises(Exception): + x = x.update({"d": 20}) + with self.assertRaises(Exception): + del x["a"] + with self.assertRaises(Exception): + _ = x.pop("a") + with self.assertRaises(Exception): + _ = x.setdefault("d", 32) + + def test_set_attributes(self): + x = ModelOutputTest(a=30) + x.a = 10 + self.assertEqual(x.a, 10) + self.assertEqual(x["a"], 10) + + def test_set_keys(self): + x = ModelOutputTest(a=30) + x["a"] = 10 + self.assertEqual(x.a, 10) + self.assertEqual(x["a"], 10) diff --git a/test_modeling_albert.py b/test_modeling_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..06e60d6925bb323f62dc144e76db177a6e97f264 --- /dev/null +++ b/test_modeling_albert.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + AlbertConfig, + AlbertForMaskedLM, + AlbertForMultipleChoice, + AlbertForPreTraining, + AlbertForQuestionAnswering, + AlbertForSequenceClassification, + AlbertForTokenClassification, + AlbertModel, + ) + from transformers.models.albert.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class AlbertModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.embedding_size = 16 + self.hidden_size = 36 + self.num_hidden_layers = 6 + self.num_hidden_groups = 6 + self.num_attention_heads = 6 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = AlbertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + num_hidden_groups=self.num_hidden_groups, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = AlbertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, 
token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = AlbertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + sentence_order_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = AlbertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = AlbertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = AlbertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = AlbertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = AlbertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + 
attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class AlbertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + AlbertModel, + AlbertForPreTraining, + AlbertForMaskedLM, + AlbertForMultipleChoice, + AlbertForSequenceClassification, + AlbertForTokenClassification, + AlbertForQuestionAnswering, + ) + if is_torch_available() + else () + ) + fx_ready_model_classes = all_model_classes + + test_sequence_classification_problem_types = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["sentence_order_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = AlbertModelTester(self) + self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = AlbertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class AlbertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = 
AlbertModel.from_pretrained("albert-base-v2") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]] + ) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) diff --git a/test_modeling_auto.py b/test_modeling_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..0ba839c42ade8086176e43821c5ef37e6c1939b5 --- /dev/null +++ b/test_modeling_auto.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import ( + DUMMY_UNKWOWN_IDENTIFIER, + SMALL_MODEL_IDENTIFIER, + require_scatter, + require_torch, + slow, +) + + +if is_torch_available(): + from transformers import ( + AutoConfig, + AutoModel, + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForPreTraining, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTableQuestionAnswering, + AutoModelForTokenClassification, + AutoModelWithLMHead, + BertConfig, + BertForMaskedLM, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertModel, + FunnelBaseModel, + FunnelModel, + GPT2Config, + GPT2LMHeadModel, + RobertaForMaskedLM, + T5Config, + T5ForConditionalGeneration, + TapasConfig, + TapasForQuestionAnswering, + ) + from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_PRETRAINING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + MODEL_WITH_LM_HEAD_MAPPING, + ) + from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.gpt2.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.tapas.modeling_tapas import TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST + + +@require_torch +class AutoModelTest(unittest.TestCase): + @slow + def test_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModel.from_pretrained(model_name) + model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) + 
self.assertIsNotNone(model) + self.assertIsInstance(model, BertModel) + for value in loading_info.values(): + self.assertEqual(len(value), 0) + + @slow + def test_model_for_pretraining_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForPreTraining.from_pretrained(model_name) + model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForPreTraining) + # Only one value should not be initialized and in the missing keys. + missing_keys = loading_info.pop("missing_keys") + self.assertListEqual(["cls.predictions.decoder.bias"], missing_keys) + for key, value in loading_info.items(): + self.assertEqual(len(value), 0) + + @slow + def test_lmhead_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelWithLMHead.from_pretrained(model_name) + model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + @slow + def test_model_for_causal_lm(self): + for model_name in GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, GPT2Config) + + model = AutoModelForCausalLM.from_pretrained(model_name) + model, loading_info = AutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, GPT2LMHeadModel) + + @slow + def test_model_for_masked_lm(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForMaskedLM.from_pretrained(model_name) + model, loading_info = AutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + @slow + def test_model_for_encoder_decoder_lm(self): + for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, T5Config) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_name) + model, loading_info = AutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, T5ForConditionalGeneration) + + @slow + def test_sequence_classification_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model, loading_info = AutoModelForSequenceClassification.from_pretrained( + model_name, output_loading_info=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForSequenceClassification) + + @slow + def test_question_answering_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + 
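# The checkpoint name alone is enough for AutoConfig to resolve the concrete config class (here BertConfig) before the task-specific auto model is instantiated.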
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForQuestionAnswering)
+
+    @slow
+    @require_scatter
+    def test_table_question_answering_model_from_pretrained(self):
+        for model_name in TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST[5:6]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, TapasConfig)
+
+            model = AutoModelForTableQuestionAnswering.from_pretrained(model_name)
+            model, loading_info = AutoModelForTableQuestionAnswering.from_pretrained(
+                model_name, output_loading_info=True
+            )
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, TapasForQuestionAnswering)
+
+    @slow
+    def test_token_classification_model_from_pretrained(self):
+        for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModelForTokenClassification.from_pretrained(model_name)
+            model, loading_info = AutoModelForTokenClassification.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForTokenClassification)
+
+    def test_from_pretrained_identifier(self):
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
+        self.assertEqual(model.num_parameters(), 14410)
+        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
+
+    def test_from_identifier_from_model_type(self):
+        model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
+        self.assertIsInstance(model, RobertaForMaskedLM)
+        self.assertEqual(model.num_parameters(), 14410)
+        self.assertEqual(model.num_parameters(only_trainable=True), 14410)
+
+    def test_from_pretrained_with_tuple_values(self):
+        # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
+        model = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
+        self.assertIsInstance(model, FunnelModel)
+
+        config = copy.deepcopy(model.config)
+        config.architectures = ["FunnelBaseModel"]
+        model = AutoModel.from_config(config)
+        self.assertIsInstance(model, FunnelBaseModel)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            model = AutoModel.from_pretrained(tmp_dir)
+            self.assertIsInstance(model, FunnelBaseModel)
+
+    def test_parents_and_children_in_mappings(self):
+        # Test that the children are placed before the parents in the mappings, as the `isinstance` check will be
+        # triggered by the parents and would return the wrong configuration type when using auto models
+
+        mappings = (
+            MODEL_MAPPING,
+            MODEL_FOR_PRETRAINING_MAPPING,
+            MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+            MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
+            MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+            MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            MODEL_WITH_LM_HEAD_MAPPING,
+            MODEL_FOR_CAUSAL_LM_MAPPING,
+            MODEL_FOR_MASKED_LM_MAPPING,
+            MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        )
+
+        for mapping in mappings:
+            mapping = tuple(mapping.items())
+            for index, (child_config, child_model) in enumerate(mapping[1:]):
+                for parent_config, parent_model in mapping[: index + 1]:
+                    assert not issubclass(
+                        child_config, parent_config
+                    ), f"{child_config.__name__} is child of {parent_config.__name__}"
+
+                    # Tuplify child_model and parent_model since some of them could be tuples.
+                    if not isinstance(child_model, (list, tuple)):
+                        child_model = (child_model,)
+                    if not isinstance(parent_model, (list, tuple)):
+                        parent_model = (parent_model,)
+
+                    for child, parent in [(a, b) for a in child_model for b in parent_model]:
+                        assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
diff --git a/test_modeling_bart.py b/test_modeling_bart.py
new file mode 100644
index 0000000000000000000000000000000000000000..20f33f0ddaa6167b3d70f2290e8ac423479b1e26
--- /dev/null
+++ b/test_modeling_bart.py
@@ -0,0 +1,962 @@
+# coding=utf-8
+# Copyright 2021, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch BART model. """
+
+
+import copy
+import tempfile
+import unittest
+
+import timeout_decorator  # noqa
+
+from transformers import is_torch_available
+from transformers.file_utils import cached_property
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+
+from .test_configuration_common import ConfigTester
+from .test_generation_utils import GenerationTesterMixin
+from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        AutoModelForSequenceClassification,
+        BartConfig,
+        BartForCausalLM,
+        BartForConditionalGeneration,
+        BartForQuestionAnswering,
+        BartForSequenceClassification,
+        BartModel,
+        BartTokenizer,
+        pipeline,
+    )
+    from transformers.models.bart.modeling_bart import BartDecoder, BartEncoder, shift_tokens_right
+
+
+def prepare_bart_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids=None,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = input_ids.ne(config.pad_token_id)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
+    }
+
+
+@require_torch
+class BartModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs(self):
+        # Sample random token ids and clamp them to >= 3 so the special ids (bos=0, pad=1, eos=2)
+        # never appear at arbitrary positions.
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
+        input_ids[:, -1] = self.eos_token_id  # Eos Token
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = BartConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+        )
+        inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = BartModel(config=config).get_decoder().to(torch_device).eval()
+        input_ids = inputs_dict["input_ids"]
+        attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+
+        # create hypothetical multiple next tokens to extend next_input_ids with
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+
+        # append the new tokens to input_ids and the attention mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
+            "last_hidden_state"
+        ]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BartModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BartEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BartDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class BartHeadTests(unittest.TestCase): + vocab_size = 99 + + def _get_config_and_data(self): + input_ids = torch.tensor( + [ + [71, 82, 18, 33, 46, 91, 2], + [68, 34, 26, 58, 30, 82, 2], + [5, 97, 17, 39, 94, 40, 2], + [76, 83, 94, 25, 70, 78, 2], + [87, 59, 41, 35, 48, 66, 2], + [55, 13, 16, 58, 5, 2, 1], # note padding + [64, 27, 31, 51, 12, 75, 2], + [52, 64, 86, 17, 83, 39, 2], + [48, 61, 9, 24, 71, 82, 2], + [26, 1, 60, 48, 22, 13, 2], + [21, 5, 62, 28, 14, 76, 2], + [45, 98, 37, 86, 59, 48, 2], + [70, 70, 50, 9, 28, 0, 2], + ], + dtype=torch.long, + device=torch_device, + ) + + batch_size = input_ids.shape[0] + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + return config, input_ids, batch_size + + def test_sequence_classification_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + labels = _long_tensor([2] * batch_size).to(torch_device) + model = BartForSequenceClassification(config) + model.to(torch_device) + outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels) + expected_shape = torch.Size((batch_size, config.num_labels)) + self.assertEqual(outputs["logits"].shape, expected_shape) + self.assertIsInstance(outputs["loss"].item(), float) + + def test_question_answering_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + sequence_labels = ids_tensor([batch_size], 2).to(torch_device) + model = BartForQuestionAnswering(config) + model.to(torch_device) + outputs = model( + input_ids=input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + + self.assertEqual(outputs["start_logits"].shape, input_ids.shape) + self.assertEqual(outputs["end_logits"].shape, input_ids.shape) + self.assertIsInstance(outputs["loss"].item(), float) + + @timeout_decorator.timeout(1) + def test_lm_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + lm_labels = ids_tensor([batch_size, 
input_ids.shape[1]], self.vocab_size).to(torch_device) + lm_model = BartForConditionalGeneration(config) + lm_model.to(torch_device) + outputs = lm_model(input_ids=input_ids, labels=lm_labels) + expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + self.assertIsInstance(outputs["loss"].item(), float) + + def test_lm_uneven_forward(self): + config = BartConfig( + vocab_size=self.vocab_size, + d_model=14, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=8, + decoder_ffn_dim=8, + max_position_embeddings=48, + ) + lm_model = BartForConditionalGeneration(config).to(torch_device) + context = torch.tensor( + [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], device=torch_device, dtype=torch.long + ) + summary = torch.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], device=torch_device, dtype=torch.long) + outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + + def test_generate_beam_search(self): + input_ids = torch.tensor([[71, 82, 2], [68, 34, 2]], device=torch_device, dtype=torch.long) + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + lm_model = BartForConditionalGeneration(config).to(torch_device) + lm_model.eval() + + max_length = 5 + generated_ids = lm_model.generate( + input_ids.clone(), + do_sample=True, + num_return_sequences=1, + num_beams=2, + no_repeat_ngram_size=3, + max_length=max_length, + ) + self.assertEqual(generated_ids.shape, (input_ids.shape[0], max_length)) + + def test_shift_tokens_right(self): + input_ids = torch.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=torch.long) + shifted = shift_tokens_right(input_ids, 1, 2) + n_pad_before = input_ids.eq(1).float().sum() + n_pad_after = shifted.eq(1).float().sum() + self.assertEqual(shifted.shape, input_ids.shape) + self.assertEqual(n_pad_after, n_pad_before - 1) + self.assertTrue(torch.eq(shifted[:, 0], 2).all()) + + @slow + def test_tokenization(self): + tokenizer = BartTokenizer.from_pretrained("facebook/bart-large") + examples = [" Hello world", " DomDramg"] # need leading spaces for equality + fairseq_results = [ + torch.tensor([0, 20920, 232, 2]), + torch.tensor([0, 11349, 495, 4040, 571, 2]), + ] + for ex, desired_result in zip(examples, fairseq_results): + bart_toks = tokenizer.encode(ex, return_tensors="pt").squeeze() + assert_tensors_close(desired_result.long(), bart_toks, prefix=ex) + + def test_generate_fp16(self): + config, input_ids, batch_size = self._get_config_and_data() + attention_mask = input_ids.ne(1).to(torch_device) + model = BartForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + def test_dummy_inputs(self): + config, *_ = self._get_config_and_data() + model = BartForConditionalGeneration(config).eval().to(torch_device) + model(**model.dummy_inputs) + + def test_resize_tokens_embeddings_more(self): + config, input_ids, _ = 
self._get_config_and_data() + + def _get_embs(m): + return (m.get_input_embeddings().weight.data.clone(), m.get_output_embeddings().weight.data.clone()) + + model = BartForConditionalGeneration(config).eval().to(torch_device) + input, output = _get_embs(model) + self.assertTrue(torch.eq(input, output).all()) + new_vocab_size = 45 + model.resize_token_embeddings(new_vocab_size) + input_new, output_new = _get_embs(model) + self.assertEqual(input_new.shape, (new_vocab_size, config.d_model)) + self.assertEqual(output_new.shape, (new_vocab_size, config.d_model)) + self.assertTrue(torch.eq(input_new, output_new).all()) + + +@require_torch +class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (BartForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = BartModelTester(self) + self.config_tester = ConfigTester(self, config_class=BartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # BartForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (BartModel, BartForConditionalGeneration, BartForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = BartForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def 
assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +@require_torch +@slow +class FastIntegrationTests(unittest.TestCase): + """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer.""" + + @cached_property + def tok(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + @cached_property + def xsum_1_1_model(self): + return BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1") + + def test_xsum_1_1_generation(self): + hf = self.xsum_1_1_model + tok = self.tok + ARTICLE = 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." 
In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' + EXPECTED = " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + + dct = tok(ARTICLE, return_tensors="pt") + generated_ids = hf.generate(**dct, num_beams=4) + result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] + assert EXPECTED == result + + def test_xsum_1_1_batch_generation(self): + # test batch + + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. 
"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." 
Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." 
Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="pt", + padding="longest", + truncation=True, + ) + generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4) + result = self.tok.batch_decode(generated_ids, skip_special_tokens=True) + assert ( + result[0] + == " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + ) + assert ( + result[1] + == " An investigation into the crash that killed at least 10 people in the French capital has been released by the French police investigating the crash." + ) + + def test_encoder_equiv(self): + # test batch + + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. 
Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. 
The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. 
French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. 
CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="pt", + padding="longest", + truncation=True, + ) + features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state + expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]] + assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class BartModelIntegrationTests(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + @slow + def test_inference_no_head(self): + model = BartModel.from_pretrained("facebook/bart-large").to(torch_device) + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = input_ids.ne(model.config.pad_token_id) + with torch.no_grad(): + output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) + + @slow + def test_base_mask_filling(self): + pbase = pipeline(task="fill-mask", model="facebook/bart-base") + src_text = [" I went to the <mask>."] + results = [x["token_str"] for x in pbase(src_text)] + assert " bathroom" in results + + @slow + def test_large_mask_filling(self): + plarge = pipeline(task="fill-mask", model="facebook/bart-large") + src_text = [" I went to the <mask>."] + results = [x["token_str"] for x in plarge(src_text)] + expected_results = [" bathroom", " gym", " wrong", " movies", " hospital"] + self.assertListEqual(results, expected_results) + + @slow + def test_mnli_inference(self): + example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1] + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]) + + model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").to( + torch_device + ) # eval called in from_pretrained + attention_mask = input_ids.ne(model.config.pad_token_id) + # Test that model hasn't changed + with torch.no_grad(): + outputs = model(input_ids=input_ids, attention_mask=attention_mask) + + batched_logits = outputs.logits + expected_shape = torch.Size((2, 3)) + self.assertEqual(batched_logits.shape, expected_shape) + expected_slice = torch.tensor([[0.1907, 1.4342, -1.0289]], device=torch_device) + logits_arr = batched_logits[0].detach() + + # Test that padding does not change results + input_ids_no_pad = _long_tensor([example_b[:-1]]) + attention_mask_no_pad = input_ids_no_pad.ne(model.config.pad_token_id) + + with torch.no_grad(): + logits2 = model(input_ids=input_ids_no_pad, attention_mask=attention_mask_no_pad).logits.squeeze() + assert_tensors_close(batched_logits[1], logits2, atol=1e-3) + assert_tensors_close(expected_slice, logits_arr, atol=1e-3) + + @slow + def test_xsum_summarization_same_as_fairseq(self): + model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum").to(torch_device) + tok = self.default_tokenizer + + PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires.
Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" + + EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state." + dct = tok.batch_encode_plus( + [PGE_ARTICLE], + max_length=1024, + padding="max_length", + truncation=True, + return_tensors="pt", + ).to(torch_device) + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + num_beams=2, + max_length=62, + min_length=11, + length_penalty=1.0, + no_repeat_ngram_size=3, + early_stopping=True, + decoder_start_token_id=model.config.eos_token_id, + ) + + decoded = tok.batch_decode( + hypotheses_batch, + skip_special_tokens=True, + ) + self.assertEqual(EXPECTED_SUMMARY, decoded[0]) + + def test_xsum_config_generation_params(self): + config = BartConfig.from_pretrained("facebook/bart-large-xsum") + expected_params = dict(num_beams=6, do_sample=False, early_stopping=True, length_penalty=1.0) + config_params = {k: getattr(config, k, "MISSING") for k, v in expected_params.items()} + self.assertDictEqual(expected_params, config_params) + + @slow + def test_cnn_summarization_same_as_fairseq(self): + hf = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device) + tok = BartTokenizer.from_pretrained("facebook/bart-large") + + FRANCE_ARTICLE = ' Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. 
Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. 
Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # noqa + + SHORTER_ARTICLE = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development.
"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + + # The below article tests that we don't add any hypotheses outside of the top n_beams + IRAN_ARTICLE = " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. 
It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." 
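+ # A short, hypothetical sketch (not from the original test; `tok` and `hf` are the tokenizer and model defined above) of how the "top n_beams" note above could be checked directly: with num_beams=2 and num_return_sequences=2, generate() should return exactly two hypotheses per input and no more.
+ # ids = tok(IRAN_ARTICLE, max_length=1024, truncation=True, return_tensors="pt").input_ids.to(torch_device)
+ # out = hf.generate(ids, num_beams=2, num_return_sequences=2)
+ # assert out.shape[0] == 2  # one input, two returned beams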
+ + ARTICLE_SUBWAY = ' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + + dct = tok.batch_encode_plus( + [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], + max_length=1024, + padding="max_length", + truncation_strategy="only_first", + truncation=True, + return_tensors="pt", + ) + + self.assertEqual(1024, dct["input_ids"].shape[1]) + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=2, + ) + assert hypotheses_batch[:, 1].eq(0).all().item() + + EXPECTED = [ + "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " + "magazines claim to have found a cell phone video showing the crash. The publications say they watched " + "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " + "9525 were killed.", + "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " + "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " + "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " + "move toward greater justice.", + "U.S. 
and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " + "debate that has already begun will likely result in more heat than light. He says critics have made " + "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " + "nuclear weapon.", + "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " + "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " + "Bronx on Friday. If convicted, she faces up to four years in prison.", + ] + + generated_summaries = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + assert generated_summaries == EXPECTED + + +class BartStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = BartConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + encoder_layers=self.decoder_layers, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + 
attention_mask, + lm_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.decoder_seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + return ( + config, + input_ids, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = BartDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append next token to form next_input_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = BartDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append next token to input_ids and extend attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, 
output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BartDecoder, BartForCausalLM) if is_torch_available() else () + all_generative_model_classes = (BartForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = BartStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=BartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_bert.py b/test_modeling_bert.py new file mode 100755 index 0000000000000000000000000000000000000000..c87c97a543f90dfc26b912f840c56069d8dbfeef --- /dev/null +++ b/test_modeling_bert.py @@ -0,0 +1,595 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
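+ # Orientation note (added comment; the helper names below are from this file, the mixins from the sibling common-test modules): each modeling-test module pairs a plain tester class that builds a tiny random config plus inputs with a unittest.TestCase that mixes in ModelTesterMixin / GenerationTesterMixin and delegates to it. A minimal sketch of the pattern:
+ # tester = BertModelTester(self)  # `self` is the TestCase
+ # config, inputs_dict = tester.prepare_config_and_inputs_for_common()
+ # model = BertModel(config).to(torch_device).eval()
+ # outputs = model(**inputs_dict)  # smoke-test forward pass on tiny random inputs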
+ + +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + BertConfig, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLMHeadModel, + BertModel, + ) + from transformers.models.bert.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class BertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, 
sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = BertModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = BertLMHeadModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_model_for_causal_lm_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = BertLMHeadModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + 
labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = BertLMHeadModel(config=config).to(torch_device).eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append next tokens to input_ids and extend the attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BertForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
BertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = BertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = BertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = BertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + BertModel, + BertLMHeadModel, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (BertLMHeadModel,) if is_torch_available() else () + fx_ready_model_classes = all_model_classes + test_sequence_classification_problem_types = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + 
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = BertModelTester(self) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_causal_lm_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = BertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class BertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = BertModel.from_pretrained("bert-base-uncased") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]]) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head_relative_embedding_key(self): + model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[0.0756, 0.3142, -0.5128], [0.3761, 0.3462, -0.5477], [0.2052, 0.3760, -0.1240]]] + ) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head_relative_embedding_key_query(self): + model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key-query") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[0.6496, 0.3784, 0.8203], [0.8148, 0.5656, 0.2636], [-0.0681, 0.5597, 0.7045]]] + ) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) diff --git a/test_modeling_bert_generation.py b/test_modeling_bert_generation.py new file mode 100755 index 0000000000000000000000000000000000000000..0ca0d81f4067b000f55911f9c27066b6b0688311 --- /dev/null +++ b/test_modeling_bert_generation.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
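+ # The @slow integration tests in these modules pin pretrained checkpoints by comparing a small output slice against golden values recorded from a trusted run; a minimal sketch of that pattern (EXPECTED_SLICE is a hypothetical placeholder, not a value from this file):
+ # output = model(input_ids, attention_mask=attention_mask)[0]
+ # expected_slice = torch.tensor(EXPECTED_SLICE)  # e.g. a (1, 3, 3) block captured once
+ # assert torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)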
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import BertGenerationConfig, BertGenerationDecoder, BertGenerationEncoder + + +class BertGenerationEncoderTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=50, + initializer_range=0.02, + use_labels=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.use_labels = use_labels + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BertGenerationConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask, token_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + token_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + token_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, + config, + input_ids, + input_mask, + token_labels, + **kwargs, + ): + model = BertGenerationEncoder(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + input_mask, + token_labels, + encoder_hidden_states, + encoder_attention_mask, + **kwargs, + ): + 
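# cross-attention layers are needed below so the stack can attend over the encoder_hidden_states passed in +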
config.add_cross_attention = True + model = BertGenerationEncoder(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + ) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + input_mask, + token_labels, + encoder_hidden_states, + encoder_attention_mask, + **kwargs, + ): + config.is_decoder = True + config.add_cross_attention = True + model = BertGenerationDecoder(config=config).to(torch_device).eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend next_input_ids with them + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append the new tokens to input_ids and the new mask to the attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + input_mask, + token_labels, + *args, + ): + model = BertGenerationDecoder(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_torch_available() else () + all_generative_model_classes = (BertGenerationDecoder,) if is_torch_available() else () + + def setUp(self): + self.model_tester = BertGenerationEncoderTester(self) + self.config_tester = ConfigTester(self, config_class=BertGenerationConfig, hidden_size=37) + + def test_config(self): +
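# run the shared config checks (common properties, JSON round-trips, save/load) +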
self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_bert(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() + config.model_type = "bert" + self.model_tester.create_and_check_model(config, input_ids, input_mask, token_labels) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + input_mask, + token_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + input_mask, + token_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + self.assertIsNotNone(model) + + +@require_torch +class BertGenerationEncoderIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]) + output = model(input_ids)[0] + expected_shape = torch.Size([1, 8, 1024]) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]] + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + +@require_torch +class BertGenerationDecoderIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + input_ids = torch.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]) + output = model(input_ids)[0] + expected_shape = torch.Size([1, 8, 50358]) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]] + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_big_bird.py b/test_modeling_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7d12fe2d336bd54e0d51d297bad59e50a74d8e --- /dev/null +++ b/test_modeling_big_bird.py @@ -0,0 +1,904 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch BigBird model. """ + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + BigBirdConfig, + BigBirdForCausalLM, + BigBirdForMaskedLM, + BigBirdForMultipleChoice, + BigBirdForPreTraining, + BigBirdForQuestionAnswering, + BigBirdForSequenceClassification, + BigBirdForTokenClassification, + BigBirdModel, + ) + from transformers.models.big_bird.modeling_big_bird import BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST + + +class BigBirdModelTester: + def __init__( + self, + parent, + batch_size=7, + seq_length=128, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu_fast", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=256, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + attention_type="block_sparse", + use_bias=True, + rescale_embeddings=False, + block_size=16, + num_rand_blocks=3, + position_embedding_type="absolute", + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + self.attention_type = attention_type + self.use_bias = use_bias + self.rescale_embeddings = rescale_embeddings + self.block_size = block_size + self.num_rand_blocks = num_rand_blocks + self.position_embedding_type = position_embedding_type + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, 
self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BigBirdConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_encoder_decoder=False, + initializer_range=self.initializer_range, + attention_type=self.attention_type, + use_bias=self.use_bias, + rescale_embeddings=self.rescale_embeddings, + block_size=self.block_size, + num_random_blocks=self.num_rand_blocks, + position_embedding_type=self.position_embedding_type, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BigBirdModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BigBirdForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, config.num_labels)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = BigBirdModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + 
encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = BigBirdForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BigBirdForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = BigBirdForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend next_input_ids with them + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append the new tokens to input_ids and the new mask to the attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = BigBirdForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, +
end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = BigBirdForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = BigBirdForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = BigBirdForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def create_and_check_for_auto_padding( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = BigBirdModel(config) + model.to(torch_device) + model.eval() + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_change_to_full_attn( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = BigBirdModel(config) + model.to(torch_device) + model.eval() + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + # the config should not be changed + self.parent.assertTrue(model.config.attention_type == "block_sparse") + + +@require_torch +class BigBirdModelTest(ModelTesterMixin, unittest.TestCase): + + # head masking & pruning is currently not supported for big bird + test_head_masking = False + test_pruning = False + test_sequence_classification_problem_types = True + + # torchscript should be 
possible, but takes prohibitively long to test. + # Also torchscript is not an important feature to have in the beginning. + test_torchscript = False + + all_model_classes = ( + ( + BigBirdModel, + BigBirdForPreTraining, + BigBirdForMaskedLM, + BigBirdForCausalLM, + BigBirdForMultipleChoice, + BigBirdForQuestionAnswering, + BigBirdForSequenceClassification, + BigBirdForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (BigBirdForCausalLM,) if is_torch_available() else () + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = BigBirdModelTester(self) + self.config_tester = ConfigTester(self, config_class=BigBirdConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + 
choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_retain_grad_hidden_states_attentions(self): + # bigbird cannot keep gradients in attentions when `attention_type=block_sparse` + + if self.model_tester.attention_type == "original_full": + super().test_retain_grad_hidden_states_attentions() + + @slow + def test_model_from_pretrained(self): + for model_name in BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = BigBirdForPreTraining.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_model_various_attn_type(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["original_full", "block_sparse"]: + config_and_inputs[0].attention_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_fast_integration(self): + # fmt: off + input_ids = torch.tensor( + [[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73],[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 12, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 28, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 18, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231 + dtype=torch.long, + device=torch_device, + ) + # fmt: on + input_ids = input_ids % self.model_tester.vocab_size + input_ids[1] = input_ids[1] - 1 + + attention_mask = torch.ones((input_ids.shape), device=torch_device) + attention_mask[:, :-10] = 0 + + config, _, _, _, _, _, _ = self.model_tester.prepare_config_and_inputs() + torch.manual_seed(0) + model = BigBirdModel(config).eval().to(torch_device) + + with torch.no_grad(): + hidden_states = model(input_ids, attention_mask=attention_mask).last_hidden_state + self.assertTrue( + torch.allclose( + hidden_states[0, 0, :5], + torch.tensor([1.4943, 0.0928, 0.8254, -0.2816, -0.9788], device=torch_device), + atol=1e-3, + ) + ) + + def test_auto_padding(self): + self.model_tester.seq_length = 241 + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_auto_padding(*config_and_inputs) + + def test_for_change_to_full_attn(self): + self.model_tester.seq_length = 9 + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_change_to_full_attn(*config_and_inputs) + + +@require_torch +@slow +class BigBirdModelIntegrationTest(unittest.TestCase): + # we can have this true once block_sparse attn_probs works accurately + test_attention_probs = False + + def _get_dummy_input_ids(self): + # fmt: off + ids = torch.tensor( + [[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 
108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231 + dtype=torch.long, + device=torch_device, + ) + # fmt: on + return ids + + def test_inference_block_sparse_pretraining(self): + model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="block_sparse") + model.to(torch_device) + + input_ids = torch.tensor([[20920, 232, 328, 1437] * 1024], dtype=torch.long, device=torch_device) + outputs = model(input_ids) + prediction_logits = outputs.prediction_logits + seq_relationship_logits = outputs.seq_relationship_logits + + self.assertEqual(prediction_logits.shape, torch.Size((1, 4096, 50358))) + self.assertEqual(seq_relationship_logits.shape, torch.Size((1, 2))) + + expected_prediction_logits_slice = torch.tensor( + [ + [-0.2420, -0.6048, -0.0614, 7.8422], + [-0.0596, -0.0104, -1.8408, 9.3352], + [1.0588, 0.7999, 5.0770, 8.7555], + [-0.1385, -1.7199, -1.7613, 6.1094], + ], + device=torch_device, + ) + self.assertTrue( + torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) + ) + + expected_seq_relationship_logits = torch.tensor([[58.8196, 56.3629]], device=torch_device) + self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) + + def test_inference_full_pretraining(self): + model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="original_full") + model.to(torch_device) + + input_ids = torch.tensor([[20920, 232, 328, 1437] * 512], dtype=torch.long, device=torch_device) + outputs = model(input_ids) + prediction_logits = outputs.prediction_logits + seq_relationship_logits = outputs.seq_relationship_logits + + self.assertEqual(prediction_logits.shape, torch.Size((1, 512 * 4, 50358))) + self.assertEqual(seq_relationship_logits.shape, torch.Size((1, 2))) + + expected_prediction_logits_slice = torch.tensor( + [ + [0.1499, -1.1217, 0.1990, 8.4499], + [-2.7757, -3.0687, -4.8577, 7.5156], + [1.5446, 0.1982, 4.3016, 10.4281], + [-1.3705, -4.0130, -3.9629, 5.1526], + ], + device=torch_device, + ) + self.assertTrue( + torch.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) + ) + + expected_seq_relationship_logits = torch.tensor([[41.4503, 41.2406]], device=torch_device) + self.assertTrue(torch.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) + + def test_block_sparse_attention_probs(self): + """ + Asserting if outputted attention matrix is similar to hard coded attention matrix + """ + + if not self.test_attention_probs: + return + + model = BigBirdModel.from_pretrained( + "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 + ) + model.to(torch_device) + model.eval() + config = model.config + + input_ids = self._get_dummy_input_ids() + + hidden_states = model.embeddings(input_ids) + + batch_size, seqlen, _ = hidden_states.size() + attn_mask = torch.ones(batch_size, seqlen, device=torch_device, dtype=torch.float) + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = config.block_size + + blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( + attn_mask, config.block_size + ) + from_blocked_mask = 
to_blocked_mask = blocked_mask + + for i in range(config.num_hidden_layers): + pointer = model.encoder.layer[i].attention.self + + query_layer = pointer.transpose_for_scores(pointer.query(hidden_states)) + key_layer = pointer.transpose_for_scores(pointer.key(hidden_states)) + value_layer = pointer.transpose_for_scores(pointer.value(hidden_states)) + + context_layer, attention_probs = pointer.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + pointer.num_attention_heads, + pointer.num_random_blocks, + pointer.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=pointer.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=True, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + cl = torch.einsum("bhqk,bhkd->bhqd", attention_probs, value_layer) + cl = cl.view(context_layer.size()) + + self.assertTrue(torch.allclose(context_layer, cl, atol=0.001)) + + def test_block_sparse_context_layer(self): + model = BigBirdModel.from_pretrained( + "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 + ) + model.to(torch_device) + model.eval() + config = model.config + + input_ids = self._get_dummy_input_ids() + dummy_hidden_states = model.embeddings(input_ids) + + attn_mask = torch.ones_like(input_ids, device=torch_device) + blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( + attn_mask, config.block_size + ) + targeted_cl = torch.tensor( + [ + [0.1874, 1.5260, 0.2335, -0.0473, -0.0961, 1.8384, -0.0141, 0.1250, 0.0085, -0.0048], + [-0.0554, 0.0728, 0.1683, -0.1332, 0.1741, 0.1337, -0.2380, -0.1849, -0.0390, -0.0259], + [-0.0419, 0.0767, 0.1591, -0.1399, 0.1789, 0.1257, -0.2406, -0.1772, -0.0261, -0.0079], + [0.1860, 1.5172, 0.2326, -0.0473, -0.0953, 1.8291, -0.0147, 0.1245, 0.0082, -0.0046], + [0.1879, 1.5296, 0.2335, -0.0471, -0.0975, 1.8433, -0.0136, 0.1260, 0.0086, -0.0054], + [0.1854, 1.5147, 0.2334, -0.0480, -0.0956, 1.8250, -0.0149, 0.1222, 0.0082, -0.0060], + [0.1859, 1.5184, 0.2334, -0.0474, -0.0955, 1.8297, -0.0143, 0.1234, 0.0079, -0.0054], + [0.1885, 1.5336, 0.2335, -0.0467, -0.0979, 1.8481, -0.0130, 0.1269, 0.0085, -0.0049], + [0.1881, 1.5305, 0.2335, -0.0471, -0.0976, 1.8445, -0.0135, 0.1262, 0.0086, -0.0053], + [0.1852, 1.5148, 0.2333, -0.0480, -0.0949, 1.8254, -0.0151, 0.1225, 0.0079, -0.0055], + [0.1877, 1.5292, 0.2335, -0.0470, -0.0972, 1.8431, -0.0135, 0.1259, 0.0084, -0.0052], + [0.1874, 1.5261, 0.2334, -0.0472, -0.0968, 1.8393, -0.0140, 0.1251, 0.0084, -0.0052], + [0.1853, 1.5151, 0.2331, -0.0478, -0.0948, 1.8256, -0.0154, 0.1228, 0.0086, -0.0052], + [0.1867, 1.5233, 0.2334, -0.0475, -0.0965, 1.8361, -0.0139, 0.1247, 0.0084, -0.0054], + ], + device=torch_device, + ) + + context_layer = model.encoder.layer[0].attention.self( + dummy_hidden_states, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_mask, + to_blocked_mask=blocked_mask, + ) + context_layer = context_layer[0] + + self.assertEqual(context_layer.shape, torch.Size((1, 128, 768))) + self.assertTrue(torch.allclose(context_layer[0, 64:78, 300:310], targeted_cl, atol=0.0001)) + + def test_tokenizer_inference(self): + tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") + model = BigBirdModel.from_pretrained( + "google/bigbird-roberta-base", 
attention_type="block_sparse", num_random_blocks=3, block_size=16 + ) + model.to(torch_device) + + text = [ + "Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer’s attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA." + ] + inputs = tokenizer(text) + + for k in inputs: + inputs[k] = torch.tensor(inputs[k], device=torch_device, dtype=torch.long) + + prediction = model(**inputs) + prediction = prediction[0] + + self.assertEqual(prediction.shape, torch.Size((1, 199, 768))) + + expected_prediction = torch.tensor( + [ + [-0.0213, -0.2213, -0.0061, 0.0687], + [0.0977, 0.1858, 0.2374, 0.0483], + [0.2112, -0.2524, 0.5793, 0.0967], + [0.2473, -0.5070, -0.0630, 0.2174], + [0.2885, 0.1139, 0.6071, 0.2991], + [0.2328, -0.2373, 0.3648, 0.1058], + [0.2517, -0.0689, 0.0555, 0.0880], + [0.1021, -0.1495, -0.0635, 0.1891], + [0.0591, -0.0722, 0.2243, 0.2432], + [-0.2059, -0.2679, 0.3225, 0.6183], + [0.2280, -0.2618, 0.1693, 0.0103], + [0.0183, -0.1375, 0.2284, -0.1707], + ], + device=torch_device, + ) + self.assertTrue(torch.allclose(prediction[0, 52:64, 320:324], expected_prediction, atol=1e-4)) + + def test_inference_question_answering(self): + tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-base-trivia-itc") + model = BigBirdForQuestionAnswering.from_pretrained( + "google/bigbird-base-trivia-itc", attention_type="block_sparse", block_size=16, num_random_blocks=3 + ) + model.to(torch_device) + + context = "The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and random attention approximates full attention, while being computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context, BigBird has shown improved performance on various long document NLP tasks, such as question answering and summarization, compared to BERT or RoBERTa." 
+ + question = [ + "Which is better for longer sequences- BigBird or BERT?", + "What is the benefit of using BigBird over BERT?", + ] + inputs = tokenizer( + question, + [context, context], + padding=True, + return_tensors="pt", + add_special_tokens=True, + max_length=256, + truncation=True, + ) + + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + + start_logits, end_logits = model(**inputs).to_tuple() + + # fmt: off + target_start_logits = torch.tensor( + [[-8.9304, -10.3849, -14.4997, -9.6497, -13.9469, -7.8134, -8.9687, -13.3585, -9.7987, -13.8869, -9.2632, -8.9294, -13.6721, -7.3198, -9.5434, -11.2641, -14.3245, -9.5705, -12.7367, -8.6168, -11.083, -13.7573, -8.1151, -14.5329, -7.6876, -15.706, -12.8558, -9.1135, 8.0909, -3.1925, -11.5812, -9.4822], [-11.5595, -14.5591, -10.2978, -14.8445, -10.2092, -11.1899, -13.8356, -10.5644, -14.7706, -9.9841, -11.0052, -14.1862, -8.8173, -11.1098, -12.4686, -15.0531, -11.0196, -13.6614, -10.0236, -11.8151, -14.8744, -9.5123, -15.1605, -8.6472, -15.4184, -8.898, -9.6328, -7.0258, -11.3365, -14.4065, -10.2587, -8.9103]], # noqa: E231 + device=torch_device, + ) + target_end_logits = torch.tensor( + [[-12.4131, -8.5959, -15.7163, -11.1524, -15.9913, -12.2038, -7.8902, -16.0296, -12.164, -16.5017, -13.3332, -6.9488, -15.7756, -13.8506, -11.0779, -9.2893, -15.0426, -10.1963, -17.3292, -12.2945, -11.5337, -16.4514, -9.1564, -17.5001, -9.1562, -16.2971, -13.3199, -7.5724, -5.1175, 7.2168, -10.3804, -11.9873], [-10.8654, -14.9967, -11.4144, -16.9189, -14.2673, -9.7068, -15.0182, -12.8846, -16.8716, -13.665, -10.3113, -15.1436, -14.9069, -13.3364, -11.2339, -16.0118, -11.8331, -17.0613, -13.8852, -12.4163, -16.8978, -10.7772, -17.2324, -10.6979, -16.9811, -10.3427, -9.497, -13.7104, -11.1107, -13.2936, -13.855, -14.1264]], # noqa: E231 + device=torch_device, + ) + # fmt: on + + self.assertTrue(torch.allclose(start_logits[:, 64:96], target_start_logits, atol=1e-4)) + self.assertTrue(torch.allclose(end_logits[:, 64:96], target_end_logits, atol=1e-4)) + + input_ids = inputs["input_ids"].tolist() + answer = [ + input_ids[i][torch.argmax(start_logits, dim=-1)[i] : torch.argmax(end_logits, dim=-1)[i] + 1] + for i in range(len(input_ids)) + ] + answer = tokenizer.batch_decode(answer) + + self.assertTrue(answer == ["BigBird", "global attention"]) + + def test_fill_mask(self): + tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") + model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") + model.to(torch_device) + + input_ids = tokenizer("The goal of life is [MASK] .", return_tensors="pt").input_ids.to(torch_device) + logits = model(input_ids).logits + + # [MASK] is token at 6th position + pred_token = tokenizer.decode(torch.argmax(logits[0, 6:7], axis=-1)) + self.assertEqual(pred_token, "happiness") + + def test_auto_padding(self): + model = BigBirdModel.from_pretrained( + "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 + ) + model.to(torch_device) + model.eval() + + input_ids = torch.tensor([200 * [10] + 40 * [2] + [1]], device=torch_device, dtype=torch.long) + output = model(input_ids).to_tuple()[0] + + # fmt: off + target = torch.tensor( + [[-0.045136, -0.068013, 0.12246, -0.01356, 0.018386, 0.025333, -0.0044439, -0.0030996, -0.064031, 0.0006439], [-0.045018, -0.067638, 0.12317, -0.013998, 0.019216, 0.025695, -0.0043705, -0.0031895, -0.063153, 0.00088899], [-0.045042, -0.067305, 0.1234, -0.014512, 0.020057, 0.026084, -0.004615, -0.0031728, -0.062442, 
0.0010263], [-0.044589, -0.067655, 0.12416, -0.014287, 0.019416, 0.026065, -0.0050958, -0.002702, -0.063158, 0.0004827], [-0.044627, -0.067535, 0.1239, -0.014319, 0.019491, 0.026213, -0.0059482, -0.0025906, -0.063116, 0.00014669], [-0.044899, -0.067704, 0.12337, -0.014231, 0.019256, 0.026345, -0.0065565, -0.0022938, -0.063433, -0.00011409], [-0.045599, -0.067764, 0.12235, -0.014151, 0.019206, 0.026417, -0.0068965, -0.0024494, -0.063313, -4.4499e-06], [-0.045557, -0.068372, 0.12199, -0.013747, 0.017962, 0.026103, -0.0070607, -0.0023552, -0.06447, -0.00048756], [-0.045334, -0.068913, 0.1217, -0.013566, 0.01693, 0.025745, -0.006311, -0.0024903, -0.065575, -0.0006719], [-0.045171, -0.068726, 0.12164, -0.013688, 0.017139, 0.025629, -0.005213, -0.0029412, -0.065237, -0.00020669], [-0.044411, -0.069267, 0.12206, -0.013645, 0.016212, 0.025589, -0.0044121, -0.002972, -0.066277, -0.00067963], [-0.043487, -0.069792, 0.1232, -0.013663, 0.015303, 0.02613, -0.0036294, -0.0030616, -0.067483, -0.0012642], [-0.042622, -0.069287, 0.12469, -0.013936, 0.016204, 0.026474, -0.0040534, -0.0027365, -0.066994, -0.0014148], [-0.041879, -0.070031, 0.12593, -0.014047, 0.015082, 0.027751, -0.0040683, -0.0027189, -0.068985, -0.0027146]], # noqa: E231 + device=torch_device, + ) + # fmt: on + + self.assertEqual(output.shape, torch.Size((1, 241, 768))) + self.assertTrue(torch.allclose(output[0, 64:78, 300:310], target, atol=0.0001)) diff --git a/test_modeling_bigbird_pegasus.py b/test_modeling_bigbird_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..4965cbaa248fad7d70557492e662b2162005317d --- /dev/null +++ b/test_modeling_bigbird_pegasus.py @@ -0,0 +1,771 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch BigBirdPegasus model. 
""" + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + BigBirdPegasusConfig, + BigBirdPegasusForCausalLM, + BigBirdPegasusForConditionalGeneration, + BigBirdPegasusForQuestionAnswering, + BigBirdPegasusForSequenceClassification, + BigBirdPegasusModel, + PegasusTokenizer, + ) + from transformers.models.bigbird_pegasus.modeling_bigbird_pegasus import ( + BigBirdPegasusDecoder, + BigBirdPegasusEncoder, + ) + +MODEL_ID = "google/bigbird-pegasus-large-pubmed" + + +def prepare_bigbird_pegasus_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + + input_dict = { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } + input_dict = {k: input_dict[k].to(torch_device) for k in input_dict} + return input_dict + + +@require_torch +class BigBirdPegasusModelTester: + def __init__( + self, + parent, + batch_size=7, + seq_length=256, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=31, + hidden_act="gelu_fast", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=260, + eos_token_id=1, + pad_token_id=0, + bos_token_id=2, + attention_type="block_sparse", + use_bias=False, + block_size=16, + num_random_blocks=3, + scale_embedding=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.scale_embedding = scale_embedding + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BigBirdPegasusConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + 
encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + attention_type=self.attention_type, + use_bias=self.use_bias, + block_size=self.block_size, + num_random_blocks=self.num_random_blocks, + scale_embedding=self.scale_embedding, + ) + inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = BigBirdPegasusModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next tokens and extend next_input_ids with them + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append the new tokens to input_ids and the new mask to the attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BigBirdPegasusModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BigBirdPegasusEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BigBirdPegasusDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() <
1e-3) + + def create_and_check_model(self, config, inputs_dict): + model = BigBirdPegasusModel(config=config).to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + decoder_input_ids = inputs_dict["decoder_input_ids"] + result = model(input_ids, decoder_input_ids=decoder_input_ids, use_cache=True) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + +@require_torch +class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + BigBirdPegasusModel, + BigBirdPegasusForConditionalGeneration, + BigBirdPegasusForSequenceClassification, + BigBirdPegasusForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (BigBirdPegasusForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_missing_keys = False + test_pruning = False + test_head_masking = False + + # torchscript tests are not passing for now. + # Also torchscript is not an important feature to have in the beginning. + test_torchscript = False + + # overridden from GenerationTesterMixin to avoid a problem + # with conflicting random seeds + def _get_input_ids_and_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.attention_type = "original_full" + + input_ids = inputs_dict[self.input_name] + attention_mask = torch.ones_like(input_ids, dtype=torch.long) + + # cut to half length & take max batch_size 2 + max_batch_size = 2 + sequence_length = input_ids.shape[-1] // 2 + input_ids = input_ids[:max_batch_size, :sequence_length] + attention_mask = attention_mask[:max_batch_size, :sequence_length] + + # generate max 3 tokens + max_length = input_ids.shape[-1] + 3 + if config.eos_token_id is not None and config.pad_token_id is None: + # hack to allow generate for models such as GPT2 as is done in `generate()` + config.pad_token_id = config.eos_token_id + return config, input_ids, attention_mask, max_length + + def setUp(self): + self.model_tester = BigBirdPegasusModelTester(self) + self.config_tester = ConfigTester(self, config_class=BigBirdPegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_model_various_attn_type(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["original_full", "block_sparse"]: + config_and_inputs[0].attention_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_generate_without_input_ids(self): + if self.model_tester.attention_type == "block_sparse": + # this test can never pass for BigBird-block-sparse attention since input_ids must be a multiple of
+ + def test_generate_without_input_ids(self): + if self.model_tester.attention_type == "block_sparse": + # this test can never pass for BigBird-block-sparse attention since input_ids must be a multiple of block_size + return + super().test_generate_without_input_ids() + + def test_retain_grad_hidden_states_attentions(self): + if self.model_tester.attention_type == "block_sparse": + # this test can't pass since the returned attention matrix cannot retain gradients (and is simply 0 at many positions) + return + super().test_retain_grad_hidden_states_attentions() + + # BigBirdPegasusForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in ( + BigBirdPegasusModel, + BigBirdPegasusForConditionalGeneration, + BigBirdPegasusForQuestionAnswering, + ): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_dict.pop("decoder_attention_mask") + input_dict.pop("decoder_input_ids") + model = BigBirdPegasusForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(**input_dict) + model.generate(**input_dict, do_sample=True, early_stopping=False, num_return_sequences=3) + + @slow + def test_batched_forward_original_full(self): + self._check_batched_forward(attn_type="original_full") + + @slow + def test_batched_forward_block_sparse(self): + self._check_batched_forward(attn_type="block_sparse", tolerance=1e-1) + + def _check_batched_forward(self, attn_type, tolerance=1e-3): + config, _ = self.model_tester.prepare_config_and_inputs() + config.max_position_embeddings = 128 + config.block_size = 16 + config.attention_type = attn_type + model = BigBirdPegasusForConditionalGeneration(config).to(torch_device) + model.eval() + + chunk_length = 32 + + sample_with_padding = [3, 8, 11] * chunk_length + [0] * chunk_length + sample_without_padding = [4, 7, 9, 13] * chunk_length + target_ids_without_padding = [2, 3] * 8 + target_ids_with_padding = [7, 8] * 6 + 4 * [-100] + + attention_mask = torch.tensor( + [[1] * 3 * chunk_length + [0] * chunk_length, [1] * 4 * chunk_length], + device=torch_device, + dtype=torch.long, + ) + + input_ids = torch.tensor([sample_with_padding, sample_without_padding], device=torch_device, dtype=torch.long) + labels = torch.tensor( + [target_ids_without_padding, target_ids_with_padding], device=torch_device, dtype=torch.long + ) + + with torch.no_grad(): + logits_batched = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).logits + + with torch.no_grad(): + logits_single_first = model(input_ids=input_ids[:1, :-chunk_length], labels=labels[:1]).logits + + self.assertTrue(torch.allclose(logits_batched[0, -3:], logits_single_first[0, -3:], atol=tolerance)) + + with torch.no_grad(): + logits_single_second = model(input_ids=input_ids[1:], labels=labels[1:, :-4]).logits + + self.assertTrue(torch.allclose(logits_batched[1, 
:3], logits_single_second[0, :3], atol=tolerance)) + + def test_auto_padding(self): + ids = [[7, 6, 9] * 65] + config, _ = self.model_tester.prepare_config_and_inputs() + input_ids = torch.tensor(ids, device=torch_device, dtype=torch.long) + attention_mask = input_ids.new_ones(input_ids.shape) + decoder_input_ids = torch.tensor([[33, 5, 8] * 3], device=torch_device, dtype=torch.long) + + config.block_size = 8 + model = BigBirdPegasusForConditionalGeneration(config).eval().to(torch_device) + output1 = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[ + "logits" + ] + + ids = [[7, 6, 9] * 65 + [0] * 5] + input_ids = torch.tensor(ids, device=torch_device, dtype=torch.long) + attention_mask = torch.tensor([[1] * 3 * 65 + [0] * 5], device=torch_device, dtype=torch.long) + output2 = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[ + "logits" + ] + + self.assertTrue(torch.allclose(output1, output2, atol=1e-5)) + + def test_for_change_to_full_attn(self): + self.model_tester.seq_length = 9 + config, input_dict = self.model_tester.prepare_config_and_inputs() + + # automatic switch will happen + config.attention_type = "block_sparse" + model = BigBirdPegasusForConditionalGeneration(config).eval().to(torch_device) + state_dict = model.state_dict() + outputs1 = model(**input_dict)["logits"] + + config.attention_type = "original_full" + model = BigBirdPegasusForConditionalGeneration(config).eval().to(torch_device) + model.load_state_dict(state_dict) + outputs2 = model(**input_dict)["logits"] + + self.assertTrue(torch.allclose(outputs1, outputs2, atol=1e-5)) + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class BigBirdPegasusModelIntegrationTests(unittest.TestCase): + def _get_dummy_input_ids(self): + # fmt: off + ids = torch.tensor( + [[685, 560, 630, 193, 836, 764, 708, 360, 10, 724, 278, 755, 805, 600, 71, 473, 601, 397, 315, 706, 487, 552, 88, 175, 601, 850, 678, 538, 846, 73, 778, 917, 116, 977, 756, 710, 1023, 848, 432, 449, 851, 100, 985, 178, 756, 798, 660, 148, 911, 424, 289, 962, 266, 698, 640, 545, 544, 715, 245, 152, 676, 511, 460, 883, 184, 29, 803, 129, 129, 933, 54, 902, 551, 489, 757, 274, 336, 389, 618, 43, 443, 544, 889, 258, 322, 1000, 938, 58, 292, 871, 120, 780, 431, 83, 92, 897, 399, 612, 566, 909, 634, 939, 85, 204, 325, 775, 965, 48, 640, 1013, 132, 973, 869, 181, 1001, 847, 144, 661, 228, 955, 792, 720, 910, 374, 854, 561, 306, 582, 170, 676, 449, 96, 198, 607, 257, 882, 691, 293, 931, 817, 862, 388, 611, 555, 974, 369, 1000, 918, 202, 384, 513, 907, 371, 556, 955, 384, 24, 700, 131, 378, 99, 575, 932, 735, 124, 964, 595, 943, 740, 149, 210, 563, 412, 783, 42, 59, 706, 37, 779, 87, 44, 873, 12, 771, 308, 81, 33, 183, 129, 807, 276, 175, 555, 372, 185, 445, 489, 590, 287, 281, 638, 771, 516, 95, 227, 876, 270, 881, 297, 329, 20, 608, 841, 411, 451, 249, 181, 324, 1005, 830, 783, 865, 261, 964, 750, 140, 1021, 599, 462, 890, 622, 844, 697, 529, 153, 926, 150, 111, 26, 465, 957, 890, 887, 118, 446, 596, 674, 873, 929, 229, 508, 764, 122, 327, 470, 288, 526, 840, 697, 153, 592, 42, 275, 553, 439, 208, 780, 167, 112, 350, 1018, 130, 736, 887, 813, 217, 382, 25, 68, 979, 1008, 772, 235, 717, 999, 292, 727, 1023, 702, 710, 728, 556, 33, 12, 617, 213, 139, 695, 1004, 422, 638, 669, 624, 489, 771, 540, 980, 218, 664, 822, 308, 175, 149, 950, 542, 580, 548, 808, 394, 74, 298, 920, 900, 815, 731, 947, 877, 772, 800, 778, 395, 540, 430, 200, 424, 62, 342, 866, 45, 803, 931, 
89, 34, 646, 233, 768, 37, 769, 460, 291, 198, 895, 950, 255, 81, 447, 137, 190, 130, 210, 369, 292, 377, 348, 169, 885, 805, 177, 538, 324, 872, 509, 804, 115, 799, 30, 754, 290, 147, 274, 222, 341, 510, 515, 70, 358, 909, 557, 886, 766, 323, 624, 92, 342, 424, 552, 972, 663, 415, 658, 711, 968, 275, 861, 44, 84, 434, 810, 94, 175, 406, 202, 858, 499, 481, 988, 330, 541, 1004, 210, 618, 955, 897, 983, 576, 17, 107, 165, 607, 537, 629, 192, 196, 308, 137, 953, 860, 94, 892, 751, 88, 161, 148, 585, 456, 88, 14, 315, 594, 121, 885, 952, 833, 716, 733, 933, 282, 801, 427, 783, 471, 285, 277, 979, 325, 535, 228, 891, 596, 648, 969, 574, 654, 518, 257, 137, 208, 464, 950, 140, 5, 424, 349, 942, 283, 587, 821, 1007, 434, 220, 820, 740, 874, 787, 374, 291, 564, 671, 438, 827, 940, 824, 509, 1021, 787, 942, 856, 450, 327, 491, 54, 817, 95, 60, 337, 667, 637, 164, 571, 946, 107, 202, 301, 782, 890, 839, 551, 680, 649, 14, 1017, 904, 721, 1017, 535, 505, 848, 986, 777, 740, 775, 210, 456, 469, 474, 963, 573, 401, 57, 883, 750, 664, 281, 5, 613, 1005, 306, 344, 543, 567, 154, 789, 354, 358, 698, 408, 412, 30, 930, 372, 822, 632, 948, 855, 503, 8, 618, 1010, 138, 695, 897, 852, 377, 933, 722, 149, 886, 1009, 260, 127, 811, 578, 533, 805, 325, 977, 113, 944, 651, 238, 361, 991, 860, 556, 64, 928, 917, 455, 266, 445, 604, 624, 420, 340, 845, 275, 370, 843, 227, 226, 940, 644, 909, 229, 827, 898, 370, 129, 808, 25, 699, 293, 356, 838, 135, 4, 227, 890, 681, 445, 418, 285, 837, 27, 737, 249, 366, 948, 202, 438, 198, 930, 648, 638, 607, 73, 247, 853, 136, 708, 214, 476, 621, 324, 103, 853, 328, 596, 224, 257, 646, 348, 108, 927, 970, 980, 520, 150, 998, 477, 393, 684, 559, 1, 361, 692, 551, 90, 75, 500, 739, 636, 344, 97, 852, 283, 719, 33, 116, 455, 866, 429, 828, 826, 691, 174, 746, 133, 442, 94, 348, 402, 420, 707, 405, 942, 186, 976, 376, 677, 874, 703, 517, 498, 499, 206, 415, 366, 856, 739, 420, 586, 219, 952, 539, 375, 23, 461, 720, 355, 603, 52, 999, 815, 721, 574, 445, 816, 1019, 105, 641, 395, 972, 910, 328, 607, 519, 686, 246, 415, 528, 170, 167, 310, 940, 595, 392, 221, 834, 682, 835, 115, 861, 335, 742, 220, 247, 101, 416, 222, 179, 509, 175, 606, 627, 674, 781, 737, 746, 849, 67, 457, 1012, 126, 139, 625, 731, 156, 697, 121, 322, 449, 710, 857, 291, 976, 4, 701, 239, 678, 172, 724, 857, 583, 661, 903, 797, 628, 903, 835, 605, 989, 615, 870, 380, 710, 110, 330, 101, 695, 846, 918, 508, 672, 594, 36, 238, 244, 251, 393, 767, 282, 22, 430, 230, 983, 401, 154, 1007, 120, 678, 896, 386, 390, 711, 397, 347, 587, 1020, 951, 79, 831, 585, 200, 814, 134, 560, 700, 171, 452, 139, 755, 314, 476, 346, 388, 126, 719, 851, 198, 699, 901, 18, 710, 448, 351, 665, 644, 326, 425, 165, 571, 178, 440, 665, 674, 915, 866, 463, 754, 136, 950, 748, 47, 497, 1013, 640, 930, 338, 158, 525, 631, 815, 887, 289, 803, 116, 600, 637, 410, 175, 499, 876, 565, 1002, 623, 577, 333, 887, 586, 147, 773, 776, 644, 49, 77, 294, 117, 494, 561, 110, 979, 180, 562, 72, 859, 434, 1007, 286, 516, 75, 597, 491, 322, 888, 533, 209, 43, 499, 29, 411, 856, 181, 305, 963, 615, 778, 259, 373, 877, 746, 858, 381, 886, 613, 91, 69, 618, 523, 13, 617, 226, 422, 168, 929, 379, 290, 923, 100, 218, 307, 345, 211, 789, 735, 669, 585, 275, 410, 921, 552, 235, 636, 285, 665, 659, 708, 173, 724, 302, 823, 1, 139, 708, 903, 732, 868, 442, 967, 916, 163, 51, 243, 871]], # noqa: E231 + dtype=torch.long, + device=torch_device, + ) + # fmt: on + return ids + + def _get_dummy_target_ids(self): + # fmt: off + ids = torch.tensor( + [[13, 6, 1, 4, 12, 4, 8, 
10, 4, 6, 3, 5, 8, 7, 9, 9]], # noqa: E231 + dtype=torch.long, + device=torch_device, + ) + # fmt: on + return ids + + def test_inference_block_sparse(self): + model = BigBirdPegasusForConditionalGeneration.from_pretrained( + MODEL_ID, attention_type="block_sparse", block_size=16, num_random_blocks=3 + ) + model.to(torch_device) + + input_ids = self._get_dummy_input_ids() + target_ids = self._get_dummy_target_ids() + + outputs = model(input_ids, labels=target_ids) + prediction_logits = outputs.logits + + self.assertEqual(prediction_logits.shape, torch.Size((1, 16, 96103))) + # fmt: off + expected_prediction_logits_slice = torch.tensor( + [[1.7769, 5.8479, 6.2375, 2.2745, 8.6157, 4.7483, 5.0647, 6.5358, 2.3393, 7.8333, 3.8403, 0.0255, 7.219, 5.2759, 3.097, 6.387, 4.9341, 7.1409, 5.1179, 0.1144, 6.8268, 0.7598, 0.6258, 2.373, 0.4627, -1.9919, 1.8422, 3.4578], [1.8026, 5.9604, 5.954, 2.8642, 9.0608, 4.394, 5.3779, 7.0216, 1.543, 7.8744, 4.4231, -0.0398, 7.6091, 5.6611, 3.3536, 6.8624, 4.7699, 6.5241, 4.8893, 0.5791, 6.8368, 0.1034, 0.0338, 2.9393, 0.5034, -2.5509, 2.0172, 3.2858], [1.8426, 5.9151, 5.5374, 3.0426, 9.1762, 3.6287, 5.3916, 7.4621, 1.2582, 7.9244, 4.694, -0.1308, 7.4725, 5.5385, 3.4598, 7.0422, 4.2455, 5.797, 4.5927, 0.7478, 6.7467, -0.2695, -0.3207, 3.0269, 0.4714, -2.8134, 2.0406, 3.1089], [1.6527, 5.8416, 5.4558, 3.0044, 9.3478, 3.2607, 5.3887, 7.52, 0.9362, 7.8877, 4.8465, -0.1705, 7.3932, 5.6352, 3.5744, 7.2623, 4.0485, 5.2788, 4.5859, 0.8325, 6.6088, -0.3676, -0.6287, 3.1731, 0.4483, -3.1573, 2.0522, 2.8868]], # noqa: E231 + device=torch_device, + ) + # fmt: on + self.assertTrue( + torch.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4) + ) + + def test_inference_full_attn(self): + model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID, attention_type="original_full") + model.to(torch_device) + + input_ids = self._get_dummy_input_ids() + target_ids = self._get_dummy_target_ids() + + outputs = model(input_ids, labels=target_ids) + prediction_logits = outputs.logits + + self.assertEqual(prediction_logits.shape, torch.Size((1, 16, 96103))) + # fmt: off + expected_prediction_logits_slice = torch.tensor( + [[1.3418, 5.8304, 6.5662, 2.0448, 8.7702, 4.6579, 4.9947, 6.429, 2.4296, 7.9431, 4.217, 0.0672, 7.334, 5.1966, 2.9603, 6.0814, 4.6756, 7.5522, 5.076, 0.213, 6.6638, 0.6577, 0.244, 2.1221, 0.7531, -2.4076, 1.8731, 3.5594], [1.5525, 6.0524, 6.309, 2.6245, 9.229, 4.5213, 5.0913, 7.0622, 1.7992, 8.0962, 4.7994, -0.0248, 7.7168, 5.5878, 3.0883, 6.5248, 4.7895, 6.9974, 4.8787, 0.5445, 6.6686, 0.0102, -0.1659, 2.6195, 0.7389, -2.8956, 1.9928, 3.3777], [1.6407, 6.2104, 6.0331, 2.8076, 9.4074, 3.9772, 5.0574, 7.5316, 1.4201, 8.3035, 5.0212, -0.1031, 7.553, 5.5023, 3.1427, 6.7674, 4.4409, 6.457, 4.525, 0.728, 6.5422, -0.6234, -0.4726, 2.7486, 0.6985, -3.0804, 1.9669, 3.2365], [1.5065, 6.1271, 5.8296, 2.8405, 9.5649, 3.6834, 5.1214, 7.546, 0.9758, 8.3335, 5.1952, -0.1395, 7.4348, 5.6893, 3.2942, 7.0356, 4.1665, 5.9695, 4.3898, 0.8931, 6.3988, -0.8957, -0.7522, 2.8924, 0.6498, -3.4358, 1.8654, 2.9735]], # noqa: E231 + device=torch_device, + ) + # fmt: on + self.assertTrue( + torch.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4) + ) + + def test_seq_to_seq_generation(self): + MODEL_ID = "google/bigbird-pegasus-large-arxiv" + model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID).to(torch_device) + tokenizer = PegasusTokenizer.from_pretrained(MODEL_ID) + + ARTICLE_LEP = 
r"""the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . 
we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to recent studies , e.g. , for the cp - odd higgs boson @xmath0 we have @xmath27 gev in the nmssm @xcite , @xmath28 gev in the nmssm @xcite , and @xmath29 gev in the lepton - specific 2hdm ( l2hdm ) @xcite . with such a light cp - odd higgs boson , the z - decay into one or more @xmath0 is open up . noting that the decay @xmath30 is forbidden due to bose symmetry , we in this work study the rare @xmath1-decays @xmath6 ( @xmath22 ) , @xmath31 and @xmath4 in a comparative way for four models , namely the type - ii 2hdm@xcite , the l2hdm @xcite , the nmssm and the nmssm . in our study , we examine carefully the constraints on the light @xmath0 from many latest experimental results . this work is organized as follows . in sec . ii we briefly describe the four new physics models . in sec . iii we present the calculations of the rare @xmath1-decays . in sec . iv we list the constraints on the four new physics models . in sec . v we show the numerical results for the branching ratios of the rare @xmath1-decays in various models . finally , the conclusion is given in sec . as the most economical way , the sm utilizes one higgs doublet to break the electroweak symmetry . as a result , the sm predicts only one physical higgs boson with its properties totally determined by two free parameters . in new physics models , the higgs sector is usually extended by adding higgs doublets and/or singlets , and consequently , more physical higgs bosons are predicted along with more free parameters involved in . the general 2hdm contains two @xmath32 doublet higgs fields @xmath33 and @xmath34 , and with the assumption of cp - conserving , its scalar potential can be parameterized as@xcite : @xmath35,\end{aligned}\ ] ] where @xmath36 ( @xmath37 ) are free dimensionless parameters , and @xmath38 ( @xmath39 ) are the parameters with mass dimension . after the electroweak symmetry breaking , the spectrum of this higgs sector includes three massless goldstone modes , which become the longitudinal modes of @xmath40 and @xmath1 bosons , and five massive physical states : two cp - even higgs bosons @xmath41 and @xmath42 , one neutral cp - odd higgs particle @xmath0 and a pair of charged higgs bosons @xmath43 . noting the constraint @xmath44 with @xmath45 and @xmath46 denoting the vacuum expectation values ( vev ) of @xmath33 and @xmath34 respectively , we choose @xmath47 as the input parameters with @xmath48 , and @xmath49 being the mixing angle that diagonalizes the mass matrix of the cp - even higgs fields . the difference between the type - ii 2hdm and the l2hdm comes from the yukawa coupling of the higgs bosons to quark / lepton . in the type - ii 2hdm , one higgs doublet @xmath34 generates the masses of up - type quarks and the other doublet @xmath33 generates the masses of down - type quarks and charged leptons ; while in the l2hdm one higgs doublet @xmath33 couples only to leptons and the other doublet @xmath34 couples only to quarks . 
so the yukawa interactions of @xmath0 to fermions in these two models are given by @xcite @xmath50 with @xmath51 denoting generation index . obviously , in the type - ii 2hdm the @xmath52 coupling and the @xmath53 coupling can be simultaneously enhanced by @xmath54 , while in the l2hdm only the @xmath53 coupling is enhanced by @xmath55 . the structures of the nmssm and the nmssm are described by their superpotentials and corresponding soft - breaking terms , which are given by @xcite @xmath56 where @xmath57 is the superpotential of the mssm without the @xmath25 term , @xmath58 and @xmath59 are higgs doublet and singlet superfields with @xmath60 and @xmath61 being their scalar component respectively , @xmath62 , @xmath63 , @xmath64 , @xmath65 , @xmath66 and @xmath67 are soft breaking parameters , and @xmath68 and @xmath69 are coefficients of the higgs self interactions . with the superpotentials and the soft - breaking terms , one can get the higgs potentials of the nmssm and the nmssm respectively . like the 2hdm , the higgs bosons with same cp property will mix and the mass eigenstates are obtained by diagonalizing the corresponding mass matrices : @xmath70 where the fields on the right hands of the equations are component fields of @xmath71 , @xmath72 and @xmath61 defined by @xmath73 @xmath74 and @xmath75 are respectively the cp - even and cp - odd neutral higgs bosons , @xmath76 and @xmath77 are goldstone bosons eaten by @xmath1 and @xmath78 , and @xmath79 is the charged higgs boson . so both the nmssm and nmssm predict three cp - even higgs bosons , two cp - odd higgs bosons and one pair of charged higgs bosons . in general , the lighter cp - odd higgs @xmath0 in these model is the mixture of the singlet field @xmath80 and the doublet field combination , @xmath81 , i.e. @xmath82 and its couplings to down - type quarks are then proportional to @xmath83 . so for singlet dominated @xmath0 , @xmath84 is small and the couplings are suppressed . as a comparison , the interactions of @xmath0 with the squarks are given by@xcite @xmath85 i.e. the interaction does not vanish when @xmath86 approaches zero . just like the 2hdm where we use the vevs of the higgs fields as fundamental parameters , we choose @xmath68 , @xmath69 , @xmath87 , @xmath88 , @xmath66 and @xmath89 as input parameters for the nmssm@xcite and @xmath68 , @xmath54 , @xmath88 , @xmath65 , @xmath90 and @xmath91 as input parameters for the nmssm@xcite . about the nmssm and the nmssm , three points should be noted . the first is for the two models , there is no explicit @xmath92term , and the effective @xmath25 parameter ( @xmath93 ) is generated when the scalar component of @xmath59 develops a vev . the second is , the nmssm is actually same as the nmssm with @xmath94@xcite , because the tadpole terms @xmath95 and its soft breaking term @xmath96 in the nmssm do not induce any interactions , except for the tree - level higgs boson masses and the minimization conditions . and the last is despite of the similarities , the nmssm has its own peculiarity , which comes from its neutralino sector . in the basis @xmath97 , its neutralino mass matrix is given by @xcite @xmath98 where @xmath99 and @xmath100 are @xmath101 and @xmath102 gaugino masses respectively , @xmath103 , @xmath104 , @xmath105 and @xmath106 . 
after diagonalizing this matrix one can get the mass eigenstate of the lightest neutralino @xmath107 with mass taking the following form @xcite @xmath108 this expression implies that @xmath107 must be lighter than about @xmath109 gev for @xmath110 ( from lower bound on chargnio mass ) and @xmath111 ( perturbativity bound ) . like the other supersymmetric models , @xmath107 as the lightest sparticle acts as the dark matter in the universe , but due to its singlino - dominated nature , it is difficult to annihilate sufficiently to get the correct density in the current universe . so the relic density of @xmath107 plays a crucial way in selecting the model parameters . for example , as shown in @xcite , for @xmath112 , there is no way to get the correct relic density , and for the other cases , @xmath107 mainly annihilates by exchanging @xmath1 boson for @xmath113 , or by exchanging a light cp - odd higgs boson @xmath0 with mass satisfying the relation @xmath114 for @xmath115 . for the annihilation , @xmath54 and @xmath25 are required to be less than 10 and @xmath116 respectively because through eq.([mass - exp ] ) a large @xmath87 or @xmath25 will suppress @xmath117 to make the annihilation more difficult . the properties of the lightest cp - odd higgs boson @xmath0 , such as its mass and couplings , are also limited tightly since @xmath0 plays an important role in @xmath107 annihilation . the phenomenology of the nmssm is also rather special , and this was discussed in detail in @xcite . in the type - ii 2hdm , l2hdm , nmssm and nmssm , the rare @xmath1-decays @xmath118 ( @xmath22 ) , @xmath3 and @xmath4 may proceed by the feynman diagrams shown in fig.[fig1 ] , fig.[fig2 ] and fig.[fig3 ] respectively . for these diagrams , the intermediate state @xmath119 represents all possible cp - even higgs bosons in the corresponding model , i.e. @xmath41 and @xmath42 in type - ii 2hdm and l2hdm and @xmath41 , @xmath42 and @xmath120 in nmssm and nmssm . in order to take into account the possible resonance effects of @xmath119 in fig.[fig1](c ) for @xmath2 and fig.[fig3 ] ( a ) for @xmath11 , we have calculated all the decay modes of @xmath119 and properly included the width effect in its propagator . as to the decay @xmath121 , two points should be noted . one is , unlike the decays @xmath6 and @xmath11 , this process proceeds only through loops mediated by quarks / leptons in the type - ii 2hdm and l2hdm , and additionally by sparticles in the nmssm and nmssm . so in most cases its rate should be much smaller than the other two . the other is due to cp - invariance , loops mediated by squarks / sleptons give no contribution to the decay@xcite . in actual calculation , this is reflected by the fact that the coupling coefficient of @xmath122 differs from that of @xmath123 by a minus sign ( see eq.([asqsq ] ) ) , and as a result , the squark - mediated contributions to @xmath121 are completely canceled out . with regard to the rare decay @xmath11 , we have more explanations . in the lowest order , this decay proceeds by the diagram shown in fig.[fig3 ] ( a ) , and hence one may think that , as a rough estimate , it is enough to only consider the contributions from fig.[fig3](a ) . however , we note that in some cases of the type - ii 2hdm and l2hdm , due to the cancelation of the contributions from different @xmath119 in fig.[fig3 ] ( a ) and also due to the potentially largeness of @xmath124 couplings ( i.e. 
larger than the electroweak scale @xmath125 ) , the radiative correction from the higgs - mediated loops may dominate over the tree level contribution even when the tree level prediction of the rate , @xmath126 , exceeds @xmath20 . on the other hand , we find the contribution from quark / lepton - mediated loops can be safely neglected if @xmath127 in the type - ii 2hdm and the l2hdm . in the nmssm and the nmssm , besides the corrections from the higgs- and quark / lepton - mediated loops , loops involving sparticles such as squarks , charginos and neutralinos can also contribute to the decay . we numerically checked that the contributions from squarks and charginos can be safely neglected if @xmath127 . we also calculated part of potentially large neutralino correction ( note that there are totally about @xmath128 diagrams for such correction ! ) and found they can be neglected too . since considering all the radiative corrections will make our numerical calculation rather slow , we only include the most important correction , namely that from higgs - mediated loops , in presenting our results for the four models . one can intuitively understand the relative smallness of the sparticle contribution to @xmath11 as follows . first consider the squark contribution which is induced by the @xmath129 interaction ( @xmath130 denotes the squark in chirality state ) and the @xmath131 interaction through box diagrams . because the @xmath132 interaction conserves the chirality of the squarks while the @xmath133 interaction violates the chirality , to get non - zero contribution to @xmath11 from the squark loops , at least four chiral flippings are needed , with three of them provided by @xmath131 interaction and the rest provided by the left - right squark mixing . this means that , if one calculates the amplitude in the chirality basis with the mass insertion method , the amplitude is suppressed by the mixing factor @xmath134 with @xmath135 being the off diagonal element in squark mass matrix . next consider the chargino / neutralino contributions . since for a light @xmath0 , its doublet component , parameterized by @xmath84 in eq.([mixing ] ) , is usually small , the couplings of @xmath0 with the sparticles will never be tremendously large@xcite . so the chargino / neutralino contributions are not important too . in our calculation of the decays , we work in the mass eigenstates of sparticles instead of in the chirality basis . for the type - ii 2hdm and the l2hdm , we consider the following constraints @xcite : * theoretical constraints on @xmath136 from perturbativity , unitarity and requirements that the scalar potential is finit at large field values and contains no flat directions @xcite , which imply that @xmath137 * the constraints from the lep search for neutral higgs bosons . we compute the signals from the higgs - strahlung production @xmath138 ( @xmath139 ) with @xmath140 @xcite and from the associated production @xmath141 with @xmath142 @xcite , and compare them with the corresponding lep data which have been inputted into our code . we also consider the constraints from @xmath138 by looking for a peak of @xmath143 recoil mass distribution of @xmath1-boson @xcite and the constraint of @xmath144 mev when @xmath145 @xcite . + these constraints limit the quantities such as @xmath146 \times br ( h_i \to \bar{b } b ) $ ] on the @xmath147 plane with the the subscript @xmath148 denoting the coupling coefficient of the @xmath149 interaction . 
they also impose a model - dependent lower bound on @xmath150 , e.g. , @xmath151 for the type - ii 2hdm ( from our scan results ) , @xmath152 for the l2hdm@xcite , and @xmath153 for the nmssm @xcite . these bounds are significantly lower than that of the sm , i.e. @xmath154 , partially because in new physics models , unconventional decay modes of @xmath155 such as @xmath156 are open up . as to the nmssm , another specific reason for allowing a significantly lighter cp - even higgs boson is that the boson may be singlet - dominated in this model . + with regard to the lightest cp - odd higgs boson @xmath0 , we checked that there is no lower bound on its mass so long as the @xmath157 interaction is weak or @xmath155 is sufficiently heavy . * the constraints from the lep search for a light higgs boson via the yukawa process @xmath158 with @xmath22 and @xmath61 denoting a scalar @xcite . these constraints can limit the @xmath159 coupling versus @xmath160 in new physics models . * the constraints from the cleo - iii limit on @xmath161 and the latest babar limits on @xmath162 . these constraints will put very tight constraints on the @xmath163 coupling for @xmath164 . in our analysis , we use the results of fig.8 in the second paper of @xcite to excluded the unfavored points . * the constraints from @xmath165 couplings . since the higgs sector can give sizable higher order corrections to @xmath165 couplings , we calculate them to one loop level and require the corrected @xmath165 couplings to lie within the @xmath166 range of their fitted value . the sm predictions for the couplings at @xmath1-pole are given by @xmath167 and @xmath168 @xcite , and the fitted values are given by @xmath169 and @xmath170 , respectively@xcite . we adopt the formula in @xcite to the 2hdm in our calculation . * the constraints from @xmath171 leptonic decay . we require the new physics correction to the branching ratio @xmath172 to be in the range of @xmath173 @xcite . we use the formula in @xcite in our calculation . + about the constraints ( 5 ) and ( 6 ) , two points should be noted . one is all higgs bosons are involved in the constraints by entering the self energy of @xmath171 lepton , the @xmath174 vertex correction or the @xmath175 vertex correction , and also the box diagrams for @xmath176@xcite . since the yukawa couplings of the higgs bosons to @xmath171 lepton get enhanced by @xmath54 and so do the corrections , @xmath54 must be upper bounded for given spectrum of the higgs sector . generally speaking , the lighter @xmath0 is , the more tightly @xmath54 is limited@xcite . the other point is in the type - ii 2hdm , @xmath177 , b - physics observables as well as @xmath178 decays discussed above can constraint the model in a tighter way than the constraints ( 5 ) and ( 6 ) since the yukawa couplings of @xmath171 lepton and @xmath179 quark are simultaneously enhanced by @xmath54 . but for the l2hdm , because only the yukawa couplings of @xmath171 lepton get enhanced ( see eq.[yukawa ] ) , the constraints ( 5 ) and ( 6 ) are more important in limiting @xmath54 . * indirect constraints from the precision electroweak observables such as @xmath180 , @xmath181 and @xmath182 , or their combinations @xmath183 @xcite . we require @xmath184 to be compatible with the lep / sld data at @xmath185 confidence level@xcite . we also require new physics prediction of @xmath186 is within the @xmath187 range of its experimental value . 
the latest results for @xmath188 are @xmath189 ( measured value ) and @xmath190 ( sm prediction ) for @xmath191 gev @xcite . in our code , we adopt the formula for these observables presented in @xcite to the type - ii 2hdm and the l2hdm respectively . + in calculating @xmath180 , @xmath181 and @xmath182 , we note that these observables get dominant contributions from the self energies of the gauge bosons @xmath1 , @xmath192 and @xmath193 . since there is no @xmath194 coupling or @xmath195 coupling , @xmath0 must be associated with the other higgs bosons to contribute to the self energies . so by the uv convergence of these quantities , one can infer that , for the case of a light @xmath0 and @xmath196 , these quantities depend on the spectrum of the higgs sector in a way like @xmath197 at leading order , which implies that a light @xmath0 can still survive the constraints from the precision electroweak observables given the splitting between @xmath150 and @xmath198 is moderate@xcite . * the constraints from b physics observables such as the branching ratios for @xmath199 , @xmath200 and @xmath201 , and the mass differences @xmath202 and @xmath203 . we require their theoretical predications to agree with the corresponding experimental values at @xmath187 level . + in the type - ii 2hdm and the l2hdm , only the charged higgs boson contributes to these observables by loops , so one can expect that @xmath198 versus @xmath54 is to be limited . combined analysis of the limits in the type - ii 2hdm has been done by the ckmfitter group , and the lower bound of @xmath204 as a function of @xmath87 was given in fig.11 of @xcite . this analysis indicates that @xmath198 must be heavier than @xmath205 at @xmath185 c.l . regardless the value of @xmath54 . in this work , we use the results of fig.11 in @xcite to exclude the unfavored points . as for the l2hdm , b physics actually can not put any constraints@xcite because in this model the couplings of the charged higgs boson to quarks are proportional to @xmath206 and in the case of large @xmath54 which we are interested in , they are suppressed . in our analysis of the l2hdm , we impose the lep bound on @xmath198 , i.e. @xmath207@xcite . * the constraints from the muon anomalous magnetic moment @xmath208 . now both the theoretical prediction and the experimental measured value of @xmath208 have reached a remarkable precision , but a significant deviation still exists : @xmath209 @xcite . in the 2hdm , @xmath208 gets additional contributions from the one - loop diagrams induced by the higgs bosons and also from the two - loop barr - zee diagrams mediated by @xmath0 and @xmath155@xcite . if the higgs bosons are much heavier than @xmath25 lepton mass , the contributions from the barr - zee diagrams are more important , and to efficiently alleviate the discrepancy of @xmath208 , one needs a light @xmath0 along with its enhanced couplings to @xmath25 lepton and also to heavy fermions such as bottom quark and @xmath171 lepton to push up the effects of the barr - zee diagram@xcite . the cp - even higgs bosons are usually preferred to be heavy since their contributions to @xmath208 are negative . + in the type - ii 2hdm , because @xmath54 is tightly constrained by the process @xmath210 at the lep@xcite and the @xmath178 decay@xcite , the barr - zee diagram contribution is insufficient to enhance @xmath208 to @xmath187 range around its measured value@xcite . so in our analysis , we require the type - ii 2hdm to explain @xmath208 at @xmath211 level . 
while for the l2hdm , @xmath54 is less constrained compared with the type - ii 2hdm , and the barr - zee diagram involving the @xmath171-loop is capable to push up greatly the theoretical prediction of @xmath208@xcite . therefore , we require the l2hdm to explain the discrepancy at @xmath187 level . + unlike the other constraints discussed above , the @xmath208 constraint will put a two - sided bound on @xmath54 since on the one hand , it needs a large @xmath54 to enhance the barr - zee contribution , but on the other hand , too large @xmath54 will result in an unacceptable large @xmath208 . * since this paper concentrates on a light @xmath0 , the decay @xmath212 is open up with a possible large decay width . we require the width of any higgs boson to be smaller than its mass to avoid a too fat higgs boson@xcite . we checked that for the scenario characterized by @xmath213 , the coefficient of @xmath214 interaction is usually larger than the electroweak scale @xmath125 , and consequently a large decay width is resulted . for the nmssm and nmssm , the above constraints become more complicated because in these models , not only more higgs bosons are involved in , but also sparticles enter the constraints . so it is not easy to understand some of the constraints intuitively . take the process @xmath199 as an example . in the supersymmetric models , besides the charged higgs contribution , chargino loops , gluino loops as well as neutralino loops also contribute to the process@xcite , and depending on the susy parameters , any of these contributions may become dominated over or be canceled by other contributions . as a result , although the charged higgs affects the process in the same way as that in the type - ii 2hdm , charged higgs as light as @xmath215 is still allowed even for @xmath216@xcite . since among the constraints , @xmath208 is rather peculiar in that it needs new physics to explain the discrepancy between @xmath217 and @xmath218 , we discuss more about its dependence on susy parameters . in the nmssm and the nmssm , @xmath208 receives contributions from higgs loops and neutralino / chargino loops . for the higgs contribution , it is quite similar to that of the type - ii 2hdm except that more higgs bosons are involved in@xcite . for the neutralino / chargino contribution , in the light bino limit ( i.e. @xmath219 ) , it can be approximated by@xcite @xmath220 for @xmath221 with @xmath222 being smuon mass . so combining the two contributions together , one can learn that a light @xmath0 along with large @xmath54 and/or light smuon with moderate @xmath87 are favored to dilute the discrepancy . because more parameters are involved in the constraints on the supersymmetric models , we consider following additional constraints to further limit their parameters : * direct bounds on sparticle masses from the lep1 , the lep2 and the tevatron experiments @xcite . * the lep1 bound on invisible z decay @xmath223 ; the lep2 bound on neutralino production @xmath224 and @xmath225@xcite . * dark matter constraints from the wmap relic density 0.0975 @xmath226 0.1213 @xcite . note that among the above constraints , the constraint ( 2 ) on higgs sector and the constraint ( c ) on neutralino sector are very important . this is because in the supersymmetric models , the sm - like higgs is upper bounded by about @xmath227 at tree level and by about @xmath228 at loop level , and that the relic density restricts the lsp annihilation cross section in a certain narrow range . 
in our analysis of the nmssm , we calculate the constraints ( 3 ) and ( 5 - 7 ) by ourselves and utilize the code nmssmtools @xcite to implement the rest constraints . we also extend nmssmtools to the nmssm to implement the constraints . for the extension , the most difficult thing we faced is how to adapt the code micromegas@xcite to the nmssm case . we solve this problem by noting the following facts : * as we mentioned before , the nmssm is actually same as the nmssm with the trilinear singlet term setting to zero . so we can utilize the model file of the nmssm as the input of the micromegas and set @xmath229 . * since in the nmssm , the lsp is too light to annihilate into higgs pairs , there is no need to reconstruct the effective higgs potential to calculate precisely the annihilation channel @xmath230 with @xmath61 denoting any of higgs bosons@xcite . we thank the authors of the nmssmtools for helpful discussion on this issue when we finish such extension@xcite . with the above constraints , we perform four independent random scans over the parameter space of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively . we vary the parameters in following ranges : @xmath231 for the type - ii 2hdm , @xmath232 for the l2hdm , @xmath233 for the nmssm , and @xmath234 for the nmssm . in performing the scans , we note that for the nmssm and the nmssm , some constraints also rely on the gaugino masses and the soft breaking parameters in the squark sector and the slepton sector . since these parameters affect little on the properties of @xmath0 , we fix them to reduce the number of free parameters in our scan . for the squark sector , we adopt the @xmath235 scenario which assumes that the soft mass parameters for the third generation squarks are degenerate : @xmath236 800 gev , and that the trilinear couplings of the third generation squarks are also degenerate , @xmath237 with @xmath238 . for the slepton sector , we assume all the soft - breaking masses and trilinear parameters to be 100 gev . this setting is necessary for the nmssm since this model is difficult to explain the muon anomalous moment at @xmath239 level for heavy sleptons@xcite . finally , we assume the grand unification relation @xmath240 for the gaugino masses with @xmath241 being fine structure constants of the different gauge group . with large number of random points in the scans , we finally get about @xmath242 , @xmath243 , @xmath244 and @xmath242 samples for the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively which survive the constraints and satisfy @xmath245 . analyzing the properties of the @xmath0 indicates that for most of the surviving points in the nmssm and the nmssm , its dominant component is the singlet field ( numerically speaking , @xmath246 ) so that its couplings to the sm fermions are suppressed@xcite . our analysis also indicates that the main decay products of @xmath0 are @xmath247 for the l2hdm@xcite , @xmath248 ( dominant ) and @xmath247 ( subdominant ) for the type - ii 2hdm , the nmssm and the nmssm , and in some rare cases , neutralino pairs in the nmssm@xcite . in fig.[fig4 ] , we project the surviving samples on the @xmath249 plane . this figure shows that the allowed range of @xmath54 is from @xmath250 to @xmath251 in the type - ii 2hdm , and from @xmath252 to @xmath253 in the l2hdm . 
just as we introduced before , the lower bounds of @xmath254 come from the fact that we require the models to explain the muon anomalous moment , while the upper bound is due to we have imposed the constraint from the lep process @xmath255 , which have limited the upper reach of the @xmath256 coupling for light @xmath61 @xcite(for the dependence of @xmath256 coupling on @xmath54 , see sec . this figure also indicates that for the nmssm and the nmssm , @xmath54 is upper bounded by @xmath257 . for the nmssm , this is because large @xmath87 can suppress the dark matter mass to make its annihilation difficult ( see @xcite and also sec . ii ) , but for the nmssm , this is because we choose a light slepton mass so that large @xmath54 can enhance @xmath208 too significantly to be experimentally unacceptable . we checked that for the slepton mass as heavy as @xmath258 , @xmath259 is still allowed for the nmssm . in fig.[fig5 ] and fig.[fig6 ] , we show the branching ratios of @xmath260 and @xmath261 respectively . fig.[fig5 ] indicates , among the four models , the type - ii 2hdm predicts the largest ratio for @xmath260 with its value varying from @xmath262 to @xmath263 . the underlying reason is in the type - ii 2hdm , the @xmath264 coupling is enhanced by @xmath54 ( see fig.[fig4 ] ) , while in the other three model , the coupling is suppressed either by @xmath265 or by the singlet component of the @xmath0 . fig.[fig6 ] shows that the l2hdm predicts the largest rate for @xmath266 with its value reaching @xmath5 in optimum case , and for the other three models , the ratio of @xmath261 is at least about one order smaller than that of @xmath267 . this feature can be easily understood from the @xmath268 coupling introduced in sect . we emphasize that , if the nature prefers a light @xmath0 , @xmath260 and/or @xmath269 in the type - ii 2hdm and the l2hdm will be observable at the gigaz . then by the rates of the two decays , one can determine whether the type - ii 2hdm or the l2hdm is the right theory . on the other hand , if both decays are observed with small rates or fail to be observed , the singlet extensions of the mssm are favored . in fig.[fig7 ] , we show the rate of @xmath3 as the function of @xmath270 . this figure indicates that the branching ratio of @xmath121 can reach @xmath271 , @xmath272 , @xmath273 and @xmath274 for the optimal cases of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively , which implies that the decay @xmath121 will never be observable at the gigaz if the studied model is chosen by nature . the reason for the smallness is , as we pointed out before , that the decay @xmath121 proceeds only at loop level . comparing the optimum cases of the type - ii 2hdm , the nmssm and the nmssm shown in fig.5 - 7 , one may find that the relation @xmath275 holds for any of the decays . this is because the decays are all induced by the yukawa couplings with similar structure for the models . in the supersymmetric models , the large singlet component of the light @xmath0 is to suppress the yukawa couplings , and the @xmath0 in the nmssm has more singlet component than that in the nmssm . next we consider the decay @xmath11 , which , unlike the above decays , depends on the higgs self interactions . in fig.[fig8 ] we plot its rate as a function of @xmath270 and this figure indicates that the @xmath276 may be the largest among the ratios of the exotic @xmath1 decays , reaching @xmath277 in the optimum cases of the type - ii 2hdm , the l2hdm and the nmssm . 
the underlying reason is , in some cases , the intermediate state @xmath119 in fig.[fig3 ] ( a ) may be on - shell . in fact , we find this is one of the main differences between the nmssm and the nmssm , that is , in the nmssm , @xmath119 in fig.[fig3 ] ( a ) may be on - shell ( corresponds to the points with large @xmath278 ) while in the nmssm , this seems impossible . so we conclude that the decay @xmath11 may serve as an alternative channel to test new physics models , especially it may be used to distinguish the nmssm from the nmssm if the supersymmetry is found at the lhc and the @xmath11 is observed at the gigaz with large rate . before we end our discussion , we note that in the nmssm , the higgs boson @xmath0 may be lighter than @xmath279 without conflicting with low energy data from @xmath178 decays and the other observables ( see fig.[fig4]-[fig8 ] ) . in this case , @xmath0 is axion - like as pointed out in @xcite . we checked that , among the rare @xmath1 decays discussed in this paper , the largest branching ratio comes from @xmath280 which can reach @xmath281 . since in this case , the decay product of @xmath0 is highly collinear muon pair , detecting the decay @xmath280 may need some knowledge about detectors , which is beyond our discussion . in this paper , we studied the rare @xmath1-decays @xmath2 ( @xmath7 ) , @xmath282 and @xmath4 in the type - ii 2hdm , lepton - specific 2hdm , nmssm and nmssm , which predict a light cp - odd higgs boson @xmath0 . in the parameter space allowed by current experiments , the branching ratio can be as large as @xmath5 for @xmath118 , @xmath8 for @xmath3 and @xmath9 for @xmath4 , which implies that the decays @xmath2 and @xmath283 may be accessible at the gigaz option . since different models predict different size of branching ratios , these decays can be used to distinguish different model through the measurement of these rare decays . this work was supported in part by hastit under grant no . 2009hastit004 , by the national natural science foundation of china ( nnsfc ) under grant nos . 10821504 , 10725526 , 10635030 , 10775039 , 11075045 and by the project of knowledge innovation program ( pkip ) of chinese academy of sciences under grant no . . for some reviews , see , e.g. , m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . a * 19 * , 159 ( 2004 ) ; j. m. yang , arxiv:1006.2594 . j. i. illana , m. masip , 67 , 035004 ( 2003 ) ; j. cao , z. xiong , j. m. yang , 32 , 245 ( 2004 ) . d. atwood _ et al_. , 66 , 093005 ( 2002 ) . j. kalinowski , and s. pokorski , 219 , 116 ( 1989 ) ; a. djouadi , p. m. zerwas and j. zunft , 259 , 175 ( 1991 ) ; a. djouadi , j. kalinowski , and p. m. zerwas , z. phys . c * 54 * , 255 ( 1992 ) . m. krawczyk , _ et al . _ , 19 , 463 ( 2001 ) ; 8 , 495 ( 1999 ) . j. f. gunion , g. gamberini and s. f. novaes , 38 , 3481 ( 1988 ) ; thomas j. weiler and tzu - chiang yuan , 318 , 337 ( 1989 ) ; a. djouadi , _ et al . _ , 1 , 163 ( 1998)[hep - ph/9701342 ] . d. chang and w. y. keung , phys . lett . * 77 * , 3732 ( 1996 ) . e. keith and e. ma , 57 , 2017 ( 1998 ) ; m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod.phys . a * 19 * , 159 ( 2004 ) . f. larios , g. tavares - velasco and c. p. yuan , 64 , 055004 ( 2001 ) ; 66 , 075006 ( 2002 ) . a. djouadi , _ et al . _ , 10 , 27 ( 1999 ) [ hep - ph/9903229 ] . for a detailed introduction of the nmssm , see f. franke and h. fraas , int . j. mod . a * 12 * ( 1997 ) 479 ; for a recent review of the nmssm , see for example , u. 
ellwanger , c. hugonie , and a. m. teixeira , arxiv : 0910.1785 . see , e.g. , j. r. ellis , j. f. gunion , h. e. haber , l. roszkowski and f. zwirner , phys . rev . d * 39 * ( 1989 ) 844 ; m. drees , int . j. mod . phys . a * 4 * ( 1989 ) 3635 ; u. ellwanger , m. rausch de traubenberg and c. a. savoy , phys . b * 315 * ( 1993 ) 331 ; nucl . b * 492 * ( 1997 ) 21 ; d.j . miller , r. nevzorov , p.m. zerwas , 681 , 3 ( 2004 ) . c. panagiotakopoulos , k. tamvakis , 446 , 224 ( 1999 ) ; 469 , 145 ( 1999 ) ; c. panagiotakopoulos , a. pilaftsis , 63 , 055003 ( 2001 ) ; a. dedes , _ et al . _ , 63 , 055009 ( 2001 ) ; a. menon , _ et al . _ , 70 , 035005 ( 2004 ) ; v. barger , _ et al . _ , 630 , 85 ( 2005 ) . c. balazs , _ et al . _ , 0706 , 066 ( 2007 ) . b. a. dobrescu , k. t. matchev , 0009 , 031 ( 2000 ) ; a. arhrib , k. cheung , t. j. hou , k. w. song , hep - ph/0611211 ; 0703 , 073 ( 2007 ) ; x. g. he , j. tandean , and g. valencia , 98 , 081802 ( 2007 ) ; 0806 , 002 ( 2008 ) ; f. domingo _ et al_. , 0901 , 061 ( 2009 ) ; gudrun hiller , 70 , 034018 ( 2004 ) ; r. dermisek , and john f. gunion , 75 , 075019 ( 2007 ) ; 79 , 055014 ( 2009 ) ; 81 , 055001 ( 2010 ) ; r. dermisek , john f. gunion , and b. mcelrath , 76 , 051105 ( 2007 ) ; z. heng , _ et al_. , 77 , 095012 ( 2008 ) ; a. belyaev _ et al_. , 81 , 075021 ( 2010 ) ; d. das and u. ellwanger , arxiv:1007.1151 [ hep - ph ] . s. andreas , o. lebedev , s. ramos - sanchez and a. ringwald , arxiv:1005.3978 [ hep - ph ] . j. f. gunion , jhep * 0908 * , 032 ( 2009 ) ; r. dermisek and j. f. gunion , phys . rev . d * 81 * , 075003 ( 2010 ) . r. dermisek and j. f. gunion , phys . lett . * 95 * , 041801 ( 2005 ) ; phys . d * 73 * , 111701 ( 2006 ) . j. cao , h. e. logan , j. m. yang , 79 , 091701 ( 2009 ) . j. cao , p. wan , l. wu , j. m. yang , 80 , 071701 ( 2009 ) . j. f. gunion and h. e. haber , 67 , 075019 ( 2003 ) . r. m. barnett , _ et al . _ , phys . b * 136 * , 191 ( 1984 ) ; r. m. barnett , g. senjanovic and d. wyler , phys . d * 30 * , 1529 ( 1984 ) ; y. grossman , nucl . b * 426 * , 355 ( 1994 ) . h. s. goh , l. j. hall and p. kumar , jhep * 0905 * , 097 ( 2009 ) ; a. g. akeroyd and w. j. stirling , nucl . b * 447 * , 3 ( 1995 ) ; a. g. akeroyd , phys . b * 377 * , 95 ( 1996 ) ; h. e. logan and d. maclennan , phys . rev . d * 79 * , 115022 ( 2009 ) ; m. aoki , _ et al . _ , arxiv:0902.4665 [ hep - ph ] . v. barger , p. langacker , h. s. lee and g. shaughnessy , phys . d * 73 * , 115010 ( 2006 ) . s. hesselbach , _ et . _ , arxiv:0810.0511v2 [ hep - ph ] . de vivie and p. janot [ aleph collaboration ] , pa13 - 027 contribution to the international conference on high energy physics , warsaw , poland , 2531 july 1996 ; j. kurowska , o. grajek and p. zalewski [ delphi collaboration ] , cern - open-99 - 385 . [ aleph collaboration and delphi collaboration and l3 collaboration ] , phys . rept . * 427 * , 257 ( 2006 ) . j. cao and j. m. yang , jhep * 0812 * , 006 ( 2008 ) . m. krawczyk and d. temes , eur . j. c * 44 * , 435 ( 2005 ) . g. altarelli and r. barbieri , 253 , 161 ( 1991 ) ; m. e. peskin , t. takeuchi , 46 , 381 ( 1992 ) . c. amsler , _ et al . _ , ( particle data group ) , 667 , 1 ( 2008 ) . o. deschamps , s. descotes - genon , s. monteil , v. niess , s. tjampens and v. tisserand , arxiv:0907.5135 [ hep - ph ] . s. su and b. thomas , phys . d * 79 * , 095014 ( 2009 ) . g. abbiendi , _ et al . _ , eur . phys . j. c * 32 * , 453 ( 2004 ) . m. davier , _ et al . _ , 66 , 1 ( 2010 ) . k. cheung , _ et al . _ , phys . 
d * 64 * , 111301 ( 2001 ) . k. cheung and o. c. w. kong , phys . d * 68 * , 053003 ( 2003 ) . t. besmer , c. greub , t.hurth , 609 , 359 ( 2001 ) ; f. borzumati , _ et al . _ , 62 , 075005(2000 ) . j. cao , k. i. hikasa , w. wang , j. m. yang and l. x. yu , phys . d * 82 * , 051701 ( 2010 ) [ arxiv:1006.4811 [ hep - ph ] ] . j. f. gunion , _ et . d * 73 * , 015011 ( 2006 ) . martin and j. d. wells , phys . d * 64 * , 035003 ( 2001 ) . j. abdallah _ et al . _ , eur . j. c * 31 * , 421 ( 2004 ) ; g. abbiendi _ et al . _ , eur . j. c * 35 * , 1 ( 2004 ) . j. dunkley _ et al . _ [ wmap collaboration ] , astrophys . j. suppl . * 180 * , 306 ( 2009 ) [ arxiv:0803.0586 [ astro - ph ] ] . u. ellwanger _ et al . _ , 02 , 066 ( 2005 ) . g. belanger , f. boudjema , a. pukhov and a. semenov , comput . commun . * 174 * , 577 ( 2006 ) ; comput . phys . commun . * 176 * , 367 ( 2007 ) . g. belanger , f. boudjema , c. hugonie , a. pukhov and a. semenov , jcap * 0509 * , 001 ( 2005 ) .""" + + ARTICLE_MAGNET = r"""it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been gaining much attention as soon as its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that the metal shows a magnetoresistance linear in perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , another two models , irrespective of the open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . abrikosov suggested a quantum - limit origin of lmr for the homogenous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and the carrier concentration is small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to the mobility fluctuations in a strongly inhomogenous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in bulk , while there are gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in an opposite situation where the carrier sheet density is high that electrons occupy more than one landau levels.@xcite moreover , they found that raising temperature to room temperature almost has no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far a reliable theoretical scheme capable of explaining this novel experiment has still been lacking . 
in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the ti surface state having a positive and finite effective g - factor . this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy locates in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . further , the separations of the fermi energy from the bottom of bulk band and dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a uniform in - plane electric field @xmath14 in the topological surface . the hamiltonian of this many - electron and phonon system consists of an electron part @xmath15 , a phonon part @xmath16 , and electron - impurity and electron - phonon interactions @xmath17 and @xmath18 : @xmath19 here , the electron hamiltonian is taken in the form @xmath20 , \ ] ] in which @xmath21 , @xmath22 , @xmath23 and @xmath24 , stand , respectively , for the canonical momentum , coordinate , momentum and spin operators of the @xmath25th electron having charge @xmath26 , @xmath27 is the vector potential of the perpendicular magnetic field @xmath28 in the landau gauge , @xmath29 is the fermi velocity , @xmath30 is the effective g - factor of the surface electron , and @xmath31 is the bohr magneton with @xmath32 the free electron mass . the sum index @xmath25 in eq.([helectron ] ) goes over all electrons of total number @xmath33 in the surface state of unit area . in the frame work of balance equation approach,@xcite the two - dimensional center - of - mass ( c.m . ) momentum and coordinate @xmath34 and @xmath35 , and the relative - electron momenta and coordinates @xmath36 and @xmath37 are introduced to write the hamiltonian @xmath15 into the sum of a single - particle c.m . part @xmath38 and a many - particle relative - electron part @xmath39 : @xmath40 , with @xmath41.\end{aligned}\ ] ] in this , @xmath42 is the canonical momentum of the center - of - mass and @xmath43 is the canonical momentum for the @xmath25th relative electron . here we have also introduced c.m . spin operators @xmath44 and @xmath45 . the commutation relations between the c.m . spin operators @xmath46 and @xmath47 and the spin operators @xmath48 , @xmath49 and @xmath50 of the @xmath25th electron are of order of @xmath51 : @xmath52= n^{-1}2\,{\rm i}\,\varepsi lon_{\beta_1\beta_2\beta_3}\sigma_j^{\beta_3}$ ] with @xmath53 . therefore , for a macroscopic large @xmath33 system , the c.m . 
part @xmath38 actually commutes with the relative - electron part @xmath54 in the hamiltonian , i.e. the c.m . motion and the relative motion of electrons are truly separated from each other . the couplings between the two emerge only through the electron impurity and electron phonon interactions . furthermore , the electric field @xmath55 shows up only in @xmath38 . and , in view of @xmath56={\rm i}\delta_{\alpha \beta}(\delta_{ij}-1/n)\simeq { \rm i}\delta_{\alpha\beta}\delta_{ij}$ ] , i.e. the relative - electron momenta and coordinates can be treated as canonical conjugate variables , the relative - motion part @xmath54 is just the hamiltonian of @xmath33 electrons in the surface state of ti in the magnetic field without the presence of the electric field . in terms of the c.m . coordinate @xmath57 and the relative electron density operator @xmath58 , the electron impurity and electron phonon interactions can be written as@xcite @xmath59 here @xmath60 and @xmath61 are respectively the impurity potential ( an impurity at randomly distributed position @xmath62 ) and electron phonon coupling matrix element in the plane - wave representation , and @xmath63 with @xmath64 and @xmath65 being the creation and annihilation operators for a phonon of wavevector @xmath66 in branch @xmath67 having frequency @xmath68 . velocity ( operator ) @xmath69 is the time variation of its coordinate : @xmath70= v_{\rm f}(\sigma_{\rm c}^y\ , \hat{i}-\sigma_{\rm c}^x\ , \hat{j})$ ] . to derive a force - balance equation for steady state transport we consider the heisenberg equation for the rate of change of the c.m . canonical momentum @xmath71 : @xmath72= - n e({\bm v}\times { \bm b})- n e{\bm e}+{\bm { f}}_{\rm i}+{\bm { f}}_{\rm p},\ ] ] in which the frictional forces @xmath73 and @xmath74 share the same expressions as given in ref .. the statistical average of the operator equation can be determined to linear order in the electron impurity and electron phonon interactions @xmath17 and @xmath18 with the initial density matrix @xmath75 at temperature @xmath76 when the in - plane electric field @xmath77 is not strong . for steady - transport states we have @xmath78 , leading to a force - balance equation of the form @xmath79 here @xmath80 , the statistically averaged velocity of the moving center - of - mass , is identified as the average rate of change of its position , i.e. the drift velocity of the electron system driven by the electric field @xmath77 , and @xmath81 and @xmath82 are frictional forces experienced by the center - of - mass due to impurity and phonon scatterings : @xmath83,\label{fp}\end{aligned}\ ] ] in which @xmath84 is the bose distribution function , @xmath85 , and @xmath86 stands for the imaginary part of the fourier spectrum of the relative - electron density correlation function defined by @xmath87\big\rangle_{0},\ ] ] where @xmath88 and @xmath89 denotes the statistical averaging over the initial density matrix @xmath90.@xcite the force - balance equation describes the steady - state two - dimensional magnetotransport in the surface state of a ti . note that the frictional forces @xmath81 and @xmath82 are in the opposite direction of the drift velocity @xmath91 and their magnitudes are functions of @xmath92 only . with the drift velocity @xmath93 in the @xmath9 direction , the force - balance equation eq . yields a transverse resistivity @xmath94 , and a longitudinal resistivity @xmath95 . 
the linear one is in the form @xmath96 for calculating the electron density correlation function @xmath97 we proceed in the landau representation.@xcite the landau levels of the single - particle hamiltonian @xmath98 of the relative - electron system in the absence of electric field are composed of a positive `` @xmath99 '' and a negative `` @xmath100 '' branch@xcite @xmath101 with @xmath102 and @xmath103 , and a zero ( @xmath104 ) level @xmath105 the corresponding landau wave functions are @xmath106 and @xmath107 for @xmath108 ; and @xmath109 for @xmath104 . here @xmath110 is the wavevector of the system along @xmath9 direction ; @xmath111 with @xmath112 ; and @xmath113 is the harmonic oscillator eigenfunction with @xmath114 being the hermite polynomial , @xmath115 , and @xmath116 . each landau level contains @xmath117 electron states for system of unit surface area . the positive branch @xmath118 and the @xmath104 level @xmath119 of the above energy spectra are indeed quite close to those of the surface states in the bulk gap of bi@xmath0se@xmath1-family materials derived from microscopic band calculation.@xcite the landau levels are broadened due to impurity , phonon and electron - electron scatterings . we model the imaginary part of the retarded green s function , or the density - of - states , of the broadened landau level @xmath120 ( written for `` + ' ' -branch and @xmath104 levels ) , using a gaussian - type form:@xcite @xmath121,\ ] ] with a half - width @xmath122 of the form:@xcite @xmath123^{1/2}$ ] . here @xmath124 is the single - particle lifetime and @xmath125 is the cyclotron frequency of linear - energy - dispersion system with @xmath126 being the zero - temperature fermi level . using a semi - empirical parameter @xmath127 to relate @xmath124 with the transport scattering time @xmath128 , and expressing @xmath129 with the zero - field mobility @xmath5 at finite temperature,@xcite we can write the landau - level broadening as @xmath130^{1/2}.\ ] ] in the present study we consider the case of @xmath120-doping , i.e. the fermi level is high enough above the energy zero of the dirac cone in the range of `` + ' ' -branch levels and the states of `` @xmath100''-branch levels are completely filled , that they are irrelevant to electron transport . special attention has to be paid to the @xmath104 level , since , depending on the direction of exchange potential the effective g - factor of a ti surface state , @xmath30 , can be positive , zero or negative.@xcite the sign and magnitude of the effective g - factor determines how many states of the zero level should be included in or excluded from the available states for electron occupation in the case of @xmath120-doping at a magnetic field . ( i ) if @xmath131 , the @xmath104 level center is exactly at @xmath132 and the system is electron - hole symmetric . the total number of negative energy states ( including the states of the lower half of the @xmath104 level and states of the @xmath100"-branch levels ) and that of positive energy states ( including the states of the upper half of the @xmath104 level and states of the @xmath99"-branch levels ) do not change when changing magnetic field . therefore , the lower - half negative energy states of this level are always filled and the upper - half positive - energy states of it are available for the occupation of particles which are counted as electrons participating in transport in the case of @xmath120-doping . 
( ii ) for a finite positive @xmath133 , the @xmath104 level @xmath134 moves downward to negative energy and its distance to the nearest @xmath100"-branch level is @xmath135 closer than to the nearest + " -branch level at finite magnetic field strength @xmath2 . this is equivalent to the opening of an increasingly enlarged ( with increasing @xmath2 ) energy gap between the + " -branch states and the states of the zero - level and the @xmath100"-branch levels . the opening of a sufficient energy gap implies that with increasing magnetic field the states in the + " -branch levels would no longer shrink into the zero - level , and thus the @xmath104 level should be completely excluded from the conduction band , i.e. only particles occupying the + " -branch states are counted as electrons participating in transport in the case of @xmath120-doping , when the magnetic field @xmath2 gets larger than a certain value ( depending on the magnitude of @xmath30 ) . ( iii ) for a finite negative @xmath136 , the @xmath104 level @xmath134 moves upward to positive energy and an increasingly enlarged energy gap will be opened between the states of the zero - level and the + " -branch and the states of @xmath100"-branch levels , and particles occupying the @xmath104 level and + " -branch states are electrons participating in transport when the magnetic field @xmath2 gets larger than a certain value . as a result , the experimentally accessible sheet density @xmath33 of electrons participating in transport is related to the fermi energy @xmath137 by the following equation valid at finite @xmath30 for the magnetic field @xmath2 larger than a certain value : @xmath138 in which @xmath139 + 1\}^{-1}$ ] is the fermi distribution function at temperature @xmath76 and the summation index @xmath120 goes over @xmath140 for @xmath133 , or @xmath141 for @xmath136 . in the case of @xmath131 , @xmath142\ ] ] valid for arbitrary magnetic field , in which @xmath143 . the imaginary part of relative - electron density correlation function in the presence of a magnetic field , @xmath86 , can be expressed in the landau representation as@xcite @xmath144 in which the transform factor @xmath145 ^ 2,\end{aligned}\ ] ] with @xmath146 , @xmath147 , @xmath148 , and @xmath149 being associated laguerre polynomials . the landau - representation correlation function @xmath150 in eq.([piqw ] ) can be constructed with the imaginary part of the retarded green s function @xmath151 , or the density - of - states , of the @xmath120th landau level as@xcite @xmath152\nonumber\\ & \hspace{1.2cm}\times{\rm im}g_n(\epsilon+\omega){\rm im}g_{n'}(\epsilon).\end{aligned}\ ] ] the summation indices @xmath120 and @xmath153 in eq.([piqw ] ) are taken over @xmath140 for @xmath133 , or @xmath154 for @xmath136 . in the case of @xmath131 , eq.([piqw ] ) still works and the summation indices @xmath120 and @xmath153 go over @xmath154 but with @xmath155 replaced by @xmath156 in eq.([p2nn ] ) . numerical calculations are performed for the magnetoresistivity @xmath157 of surface state in a uniform ti bi@xmath0se@xmath1 . at zero temperature the elastic scattering contributing to the resistivity is modeled by a coulomb potential due to charged impurities:@xcite @xmath158 with @xmath159 being the impurity density , which is determined by the zero - magnetic - field mobility @xmath5 . at temperatures higher than @xmath160,@xcite phonon scatterings play increasingly important role and the dominant inelastic contribution comes from optical phonons . 
for this polar material , the scattering by optical phonons via the deformation potential can be neglected . hence , we take account of inelastic scattering from optical phonons via frhlich coupling : @xmath161 . in the numerical calculation we use the following parameters:@xcite fermi velocity @xmath162 , static dielectric constant @xmath163 , optical dielectric constant @xmath164 , and phonon energy @xmath165 . the broadening parameter is taken to be @xmath166 . as a function of the magnetic field @xmath2 having different effective g - factors : @xmath167 and @xmath168 for a ti surface system with electron sheet density @xmath169 in the cases of zero - magnetic - field mobility @xmath170 ( a ) and @xmath171 ( b ) . several integer - number positions of filling factor @xmath172 are marked in ( b).,scaledwidth=40.0% ] fig.[diffg ] shows the calculated magnetoresistivity @xmath157 versus the magnetic field strength @xmath2 for a ti surface system with electron sheet density @xmath169 but having different effective g - factors : @xmath167 and @xmath168 for two values of zero - magnetic - field mobility @xmath170 and @xmath171 , representing different degree of landau - level broadening . in the case without zeeman splitting ( @xmath131 ) the resistivity @xmath157 exhibits almost no change with changing magnetic field up to 10 t , except the shubnikov - de haas ( sdh ) oscillation showing up in the case of @xmath171 . this kind of magnetoresistance behavior was indeed seen experimentally in the electron - hole symmetrical massless system of single - layer graphene.@xcite in the case of a positive g - factor , @xmath173 , the magnetoresistivity increases linearly with increasing magnetic field ; while for a negative g - factor , @xmath174 , the magnetoresistivity decreases linearly with increasing magnetic field . is shown as a function of the magnetic field @xmath2 for different values of zero - magnetic - field mobility : ( a ) @xmath175 , ( b ) @xmath176 , ( c ) @xmath177 , ( d ) @xmath178 , ( e ) @xmath179 , and ( f ) @xmath180 . the inset of ( a ) illustrates the same for a larger magnetic - field range @xmath181 . the filling factor @xmath182 is plotted versus the magnetic field in ( f ) ; and several integer - number positions of @xmath182 are also marked in ( d ) and ( e ) . here the surface electron density @xmath169 and the lattice temperature @xmath183.,scaledwidth=47.0% ] in the following we will give more detailed examination on the linearly increasing magnetoresistance in the positive @xmath30 case . fig.[rhob ] shows the calculated resistivity @xmath157 versus the magnetic field strength @xmath2 at lattice temperature @xmath183 for system of carrier sheet density @xmath169 and @xmath173 , having different zero - field mobility @xmath184 and @xmath180 . all resistivity curves for mobility @xmath185 exhibit clear linearity in the magnetic - field range and appear no tendency of saturation at the highest field shown in the figure . especially , for the case @xmath170 , the linear behavior extends even up to the magnetic field of @xmath186 , as illustrated in the inset of fig.[rhob](a ) . this feature contradicts the classical mr which saturates at sufficiently large magnetic field @xmath187 . 
note that here we only present the calculated @xmath157 for magnetic field @xmath2 larger than @xmath188 t , for which a sufficient energy gap @xmath135 is assumed to open that with further increase of the magnetic field the states in the `` + ' ' -branch levels no longer shrink into the zero level and thus it should be excluded from the conduction band . this is of course not true for very weak magnetic field . when @xmath189 the energy gap @xmath190 , the situation becomes similar to the case of @xmath131 : the whole upper half of the zero - level states are available to electron occupation and we should have a flat resistivity @xmath157 when changing magnetic field . with increasing @xmath2 the portion of the zero - level states available to conduction electrons decreases until the magnetic field reaches @xmath191 . as a result the resistivity @xmath157 should exhibit a crossover from a flat changing at small @xmath2 to positively linear increasing at @xmath192 . this is just the behavior observed in the ti bi@xmath0se@xmath1.@xcite note that in the case of @xmath170 , the broadened landau - level widths are always larger than the neighboring level interval : @xmath193 , which requires @xmath194 ^ 2 $ ] , even for the lowest landau level @xmath195 , i.e. the whole landau - level spectrum is smeared . with increasing the zero - field mobility the magnitude of resistivity @xmath157 decreases , and when the broadened landau - level width becomes smaller than the neighboring level interval , @xmath196 , a weak sdh oscillation begin to occur around the linearly - dependent average value of @xmath157 at higher portion of the magnetic field range , as seen in fig.[rhob](c ) , ( d ) and ( e ) for @xmath197 and @xmath198 . on the other hand , in the case of large mobility , e.g. @xmath199 , where the broadened landau - level widths @xmath200 are much smaller than the neighboring level interval even for level index @xmath120 as large as @xmath201 , the magnetoresistivity shows pronounced sdh oscillation and the linear - dependent behavior disappears , before the appearance of quantum hall effect,@xcite as shown in fig.[rhob](f ) . abrikosov s model for the lmr requires the applied magnetic field large enough to reach the quantum limit at which all the carriers are within the lowest landau level,@xcite while it is obvious that more than one landau levels are occupied in the experimental samples in the field range in which the linear and non - saturating magnetoresistivity was observed.@xcite for the given electron surface density @xmath202 , the number of occupied landau levels , or the filling factor @xmath172 , at different magnetic fields is shown in fig.[rhob](f ) , as well as in the fig.[rhob](d ) and ( e ) , where the integer - number positions of @xmath203 , i.e. filling up to entire @xmath182 landau levels , coincide with the minima of the density - of - states or the dips of sdh oscillation . this is in contrast with @xmath131 case , where the integer number of @xmath203 , which implies a filling up to the center position of the @xmath182th landau levels , locates at a peak of sdh oscillation , as shown in fig.[diffg]b . the observed sdh oscillations in the bi@xmath0se@xmath1 nanoribbon exhibiting nonsaturating surface lmr in the experiment@xcite favor the former case : a finite positive effective @xmath133 . 
is plotted as a function of the surface electron density @xmath33 at magnetic field @xmath204 : ( a ) at different values of zero - field mobility @xmath5 , and ( b ) at different values of zero - field conductivity @xmath205.,scaledwidth=40.0% ] at various lattice temperatures . here the zero - magnetic - field mobility at zero temperature is @xmath206.,scaledwidth=35.0% ] next , we examine the density - dependence of the linear magnetoresistivity . to compare with abrikosov s quantum magnetoresistance which suggests a @xmath207 behavior,@xcite we show the calculated @xmath208 for above lmr versus the carrier sheet density @xmath33 in fig.[rhon ] at fixed magnetic field @xmath209 t . the mobility is taken respectively to be @xmath210 and @xmath211m@xmath212/vs to make the resistivity in the lmr regime . a clearly linear dependence of @xmath213 on the surface density @xmath33 is seen in all cases , indicating that this non - saturating linear resistivity is almost inversely proportional to the carrier density . in the figure we also show @xmath208 versus @xmath33 under the condition of different given conductivity @xmath214 and @xmath215 . in this case the half - width @xmath216 is independent of surface density . the linear dependence still holds , indicating that this linear behavior is not sensitive to the modest @xmath33-dependence of landau level broadening @xmath216 as long as the system is in the overlapped landau level regime . from the above discussion , it is obvious that lmr shows up in the system having overlapped landau levels and the separation of landau levels makes the mr departure from the linear increase . at high temperature , the thermal energy would smear the level separation and phonon scatterings further broaden landau levels . hence , it is believed that this lmr will be robust against raising temperature . this is indeed the case as seen in fig.[rhot ] , where we plot the calculated magnetoresistivity @xmath157 for the above system with zero - temperature linear mobility @xmath217m@xmath212/vs versus the magnetic field at different lattice temperatures . we can see that raising temperature to room temperature has little effect on the linearity of mr . due to the decreased mobility at higher temperature from phonon scattering , the weak sdh oscillation on the linear background tends to vanish . these features are in good agreement with the experimental report.@xcite in summary , we have studied the two - dimensional magnetotransport in the flat surface of a three - dimensional ti , which arises from the surface states with a wavevector - linear energy dispersion and a finite , positive zeeman splitting within the bulk energy gap . when the level broadening is comparable to or larger than the landau - level separation and the conduction electrons spread over many landau levels , a positive , dominantly linear and non - saturating magnetoresistance appears within a quite wide range of magnetic field and persists up to room temperature . 
this remarkable lmr provides a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite in contrast to quantum hall effect which appears in the case of well formed landau levels and to abrikosov s quantum magnetotransport,@xcite which is limited to the extreme quantum limit that all electrons coalesce into the lowest landau level , the discussed lmr is a phenomena of pure classical two - dimensional magnetotransport in a system having linear - energy - dispersion , appearing in the regime of overlapped landau levels , irrespective of its showing up in relatively high magnetic field range . furthermore , the present scheme deals with spatially uniform case without invoking the mobility fluctuation in a strongly inhomogeneous system , which is required in the classical parish and littlewood model to produce a lmr.@xcite the appearance of this significant positive - increasing linear magnetoresistance depends on the existence of a positive and sizable effective g - factor . if the zeeman energy splitting is quite small the resistivity @xmath157 would exhibit little change with changing magnetic field . in the case of a negative and sizable effective g - factor the magnetoresistivity would decrease linearly with increasing magnetic field . therefore , the behavior of the longitudinal resistivity versus magnetic field may provide a useful way for judging the direction and the size of the effective zeeman energy splitting in ti surface states . this work was supported by the national science foundation of china ( grant no . 11104002 ) , the national basic research program of china ( grant no . 2012cb927403 ) and by the program for science&technology innovation talents in universities of henan province ( grant no . 2012hastit029 ) .""" + + inputs = tokenizer( + [ARTICLE_LEP, ARTICLE_MAGNET], + max_length=1024, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + inputs = {k: inputs[k].to(torch_device) for k in inputs} + + hypotheses_batch = model.generate(**inputs) + + EXPECTED_LEP = "motivated by some recent studies on the light cp - odd higgs boson @xmath0 in non - minimal supersymmetric models, we investigate the rare @xmath1-decays @xmath2 ( @xmath3 ) in the two higgs doublet model ( 2hdm ), the nearly minimal supersymmetric standard model ( nmssm ), the next - to - minimal supersymmetric standard model ( nmssm ) and the minimal supersymmetric standard model ( mssm ). we find that the branching ratios of @xmath4 can reach @xmath5 in 2hdm, @xmath6 in nmssm and @xmath7 in mssm, which are at the level of @xmath8 in 2hdm, @xmath9 in nmssm and @xmath10 in mssm, respectively. these rates can be significantly enhanced in new physics models which lie within the expected sensitivity of the gigaz option of the international linear collider ( ilc ). = # 1,nucl. phys. b * # 1" + + EXPECTED_MAGNET = "a positive, nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the surface state of a topological insulator having a positive and finite effective g - factor. this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels, and persists up to room temperature, providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons." 
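+ # NOTE: these expected summaries are tied to the exact pretrained checkpoint, tokenizer and
+ # default generation settings used above; changing any of them will break the equality assertion below.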
+ + generated = tokenizer.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + + self.assertTrue(generated == [EXPECTED_LEP, EXPECTED_MAGNET]) + + +class BigBirdPegasusStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=7, + d_model=32, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + attention_type="original_full", + use_bias=True, + block_size=16, + num_random_blocks=3, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = BigBirdPegasusConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + attention_type=self.attention_type, + use_bias=self.use_bias, + block_size=self.block_size, + num_random_blocks=self.num_random_blocks, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = BigBirdPegasusDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = 
model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append next token to input_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = BigBirdPegasusDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + # big bird has extremely high logits which require + # such a high error tolerance here + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=5e-1) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, lm_labels = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class BigBirdPegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BigBirdPegasusDecoder, BigBirdPegasusForCausalLM) if is_torch_available() else () + all_generative_model_classes = (BigBirdPegasusForCausalLM,) if is_torch_available() else () + test_pruning = False + 
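+ # head pruning is not implemented for BigBirdPegasus, so the common pruning tests are skipped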
is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = BigBirdPegasusStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=BigBirdPegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_blenderbot.py b/test_modeling_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..dfaa3cdc0a01dd44fb2f06cf35ac1fcd5fe5b967 --- /dev/null +++ b/test_modeling_blenderbot.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Blenderbot model. """ + +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer + from transformers.models.blenderbot.modeling_blenderbot import ( + BlenderbotDecoder, + BlenderbotEncoder, + BlenderbotForCausalLM, + ) + + +def prepare_blenderbot_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + 
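+# Editorial sketch (not part of the test suite) of how the default masks built by
+# prepare_blenderbot_inputs_dict above behave: pad positions are masked out via
+# ne(pad_token_id), while the head masks default to all-ones, i.e. no heads disabled.
+# The token values below are invented for illustration only.
+#
+#   import torch
+#   pad_token_id = 1  # matches the tester's pad_token_id below
+#   input_ids = torch.tensor([[0, 5, 6, 2, 1, 1]])  # two trailing pad positions
+#   input_ids.ne(pad_token_id)  # -> tensor([[ True,  True,  True,  True, False, False]])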
+@require_torch +class BlenderbotModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BlenderbotConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = BlenderbotModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to input_ids and attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:,
random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BlenderbotModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BlenderbotEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BlenderbotDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotModel, BlenderbotForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (BlenderbotForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = BlenderbotModelTester(self) + self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = BlenderbotForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, 
prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +@unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.") +@require_torch +@require_sentencepiece +@require_tokenizers +class Blenderbot3BIntegrationTests(unittest.TestCase): + ckpt = "facebook/blenderbot-3B" + + @cached_property + def tokenizer(self): + return BlenderbotTokenizer.from_pretrained(self.ckpt) + + @slow + def test_generation_from_short_input_same_as_parlai_3B(self): + FASTER_GEN_KWARGS = dict(num_beams=1, early_stopping=True, min_length=15, max_length=25) + TOK_DECODE_KW = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) + + torch.cuda.empty_cache() + model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt).half().to(torch_device) + + src_text = ["Sam"] + model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) + + generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS) + tgt_text = 'Sam is a great name. It means "sun" in Gaelic.' + + generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW) + assert generated_txt[0].strip() == tgt_text + + src_text = "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?" + + model_inputs = self.tokenizer([src_text], return_tensors="pt").to(torch_device) + + generated_ids = model.generate(**model_inputs, **FASTER_GEN_KWARGS)[0] + reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) + + assert "I think it's because we are so worried about what people think of us." 
== reply.strip() + del model + + +class BlenderbotStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + encoder_no_repeat_ngram_size=0, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + self.encoder_no_repeat_ngram_size = encoder_no_repeat_ngram_size + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = BlenderbotConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + encoder_no_repeat_ngram_size=self.encoder_no_repeat_ngram_size, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = BlenderbotDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids 
and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = BlenderbotDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + # past_key_values = model(input_ids, use_cache=True)["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotDecoder, BlenderbotForCausalLM) if is_torch_available() else () + all_generative_model_classes = (BlenderbotForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = BlenderbotStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_blenderbot_small.py b/test_modeling_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..f5dc8c42076a2fa2e0da5e06319f9228669d9c7d --- /dev/null +++ b/test_modeling_blenderbot_small.py @@ -0,0 +1,542 @@ +# coding=utf-8 +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch BlenderbotSmall model. """ + +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + BlenderbotSmallConfig, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallModel, + BlenderbotSmallTokenizer, + ) + from transformers.models.blenderbot_small.modeling_blenderbot_small import ( + BlenderbotSmallDecoder, + BlenderbotSmallEncoder, + BlenderbotSmallForCausalLM, + ) + + +def prepare_blenderbot_small_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class BlenderbotSmallModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + 
hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BlenderbotSmallConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = BlenderbotSmallModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + 
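+ # A single random feature column is enough here: with a correct cache the
+ # one-token pass must match the full pass exactly up to floating-point
+ # accumulation order, so any cache bug shows up in an arbitrary slice, and
+ # atol=1e-3 only absorbs the residual fp noise between the two code paths.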
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BlenderbotSmallModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BlenderbotSmallEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BlenderbotSmallDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (BlenderbotSmallForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = BlenderbotSmallModelTester(self) + self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = BlenderbotSmallForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if 
torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +@require_torch +class Blenderbot90MIntegrationTests(unittest.TestCase): + ckpt = "facebook/blenderbot-90M" + + @cached_property + def model(self): + model = BlenderbotSmallForConditionalGeneration.from_pretrained(self.ckpt).to(torch_device) + if torch_device == "cuda": + model = model.half() + return model + + @cached_property + def tokenizer(self): + return BlenderbotSmallTokenizer.from_pretrained(self.ckpt) + + @slow + def test_90_generation_from_long_input(self): + + src_text = [ + "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\ + i'm going to throw up.\nand why is that?" + ] + + model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) + + assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) + generated_ids = self.model.generate(**model_inputs)[0] + reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + assert reply in ( + "i don't know. i just feel like i'm going to throw up. it's not fun.", + "i'm not sure. i just feel like i've been feeling like i have to be in a certain place", + ) + + @slow + def test_90_generation_from_short_input(self): + model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device) + + generated_utterances = self.model.generate(**model_inputs) + + clean_txt = self.tokenizer.decode( + generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assert clean_txt in ( + "have you ever been to a sam club? it's a great club in the south.", + "have you ever heard of sam harris? 
he's an american singer, songwriter, and actor.", + ) + + +class BlenderbotSmallStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = BlenderbotSmallConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = BlenderbotSmallDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = 
model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = BlenderbotSmallDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotSmallDecoder, BlenderbotSmallForCausalLM) if is_torch_available() else () + all_generative_model_classes = (BlenderbotSmallForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = BlenderbotSmallStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def 
test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_bort.py b/test_modeling_bort.py new file mode 100644 index 0000000000000000000000000000000000000000..79ca940801074981d90cfdce8780f14ad3c4304e --- /dev/null +++ b/test_modeling_bort.py @@ -0,0 +1,51 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + + +if is_torch_available(): + import torch + + from transformers import AutoModel + + +@require_torch +@require_sentencepiece +@require_tokenizers +class BortIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = AutoModel.from_pretrained("amazon/bort") + model.to(torch_device) + + input_ids = torch.tensor( + [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]], + device=torch_device, + dtype=torch.long, + ) # Schloß Nymphenburg in Munich is really nice! + output = model(input_ids)["last_hidden_state"] + expected_shape = torch.Size((1, 15, 1024)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]], + device=torch_device, + dtype=torch.float, + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_camembert.py b/test_modeling_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..3a40f6a8789eee0aa03a429148ba5cf537a58bd6 --- /dev/null +++ b/test_modeling_camembert.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
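The Bort test above and the CamemBERT test that follows use the same integration-test pattern: run a pretrained checkpoint on a fixed input, assert the full output shape, then pin a 3x3 corner of the hidden states against hard-coded reference values with a small tolerance. A minimal standalone sketch of that pattern, with a hypothetical `check_output_slice` helper that is not part of this patch:

```python
import torch


def check_output_slice(output, expected_shape, expected_slice, atol=1e-4):
    # Shape first: with a wrong shape the slice comparison would be meaningless.
    assert output.shape == expected_shape, f"{output.shape} != {expected_shape}"
    # Pin only a small corner: full reference tensors would be unreviewable,
    # while a 3x3 slice still catches numerical regressions in the model code.
    assert torch.allclose(output[:, :3, :3], expected_slice, atol=atol)


# Stand-in for a real forward pass; in the tests above, the reference values
# were captured once from a trusted run (e.g. the original fairseq weights).
output = torch.zeros(1, 10, 768)
check_output_slice(output, torch.Size((1, 10, 768)), torch.zeros(1, 3, 3))
```

The tests in this diff use atol=1e-4 for the Bort and CamemBERT slice checks and a looser 1e-3 for the CLIP logits.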
+ +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + + +if is_torch_available(): + import torch + + from transformers import CamembertModel + + +@require_torch +@require_sentencepiece +@require_tokenizers +class CamembertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = CamembertModel.from_pretrained("camembert-base") + model.to(torch_device) + + input_ids = torch.tensor( + [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], + device=torch_device, + dtype=torch.long, + ) # J'aime le camembert ! + output = model(input_ids)["last_hidden_state"] + expected_shape = torch.Size((1, 10, 768)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]], + device=torch_device, + dtype=torch.float, + ) + # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0') + # camembert.eval() + # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_clip.py b/test_modeling_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..afcc5903c63d3a07cf6c42f7c027d0effba4930f --- /dev/null +++ b/test_modeling_clip.py @@ -0,0 +1,563 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch CLIP model. 
""" + + +import inspect +import os +import tempfile +import unittest + +import requests +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel + from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class CLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + return config, pixel_values + + def create_and_check_model(self, config, pixel_values): + model = CLIPVisionModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (CLIPVisionModel,) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + 
def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # CLIP has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + # skip this test as CLIPVisionModel has no base class and is + # not available in MODEL_MAPPING + def test_save_load_fast_init_from_base(self): + pass + + # skip this test as CLIPVisionModel has no base class and is + # not available in MODEL_MAPPING + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + 
initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask + + def create_and_check_model(self, config, input_ids, input_mask): + model = CLIPTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (CLIPTextModel,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + # skip this test as CLIPTextModel has no base class and is + # not available in MODEL_MAPPING + def test_save_load_fast_init_from_base(self): + pass + + # skip this test as CLIPTextModel has no base class and is + # not available in MODEL_MAPPING + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = CLIPTextModelTester(parent) + self.vision_model_tester = CLIPVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64) + + return config, input_ids, attention_mask, pixel_values + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = CLIPModel(config).to(torch_device).eval() + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class CLIPModelTest(ModelTesterMixin, unittest.TestCase): + 
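+ # The composite model is tested with most mixin checks disabled: attention
+ # and hidden-state outputs are already covered per modality by
+ # CLIPTextModelTest and CLIPVisionModelTest, and the flags below skip mixin
+ # checks that assume a single input modality or standard input embeddings.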
all_model_classes = (CLIPModel,) if is_torch_available() else () + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = CLIPModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + # hidden_states are tested in individual model tests + def test_hidden_states_output(self): + pass + + # input_embeds are tested in individual model tests + def test_inputs_embeds(self): + pass + + # tested in individual model tests + def test_retain_grad_hidden_states_attentions(self): + pass + + # CLIPModel does not have input/output embeddings + def test_model_common_attributes(self): + pass + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CLIP needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + @slow + def test_model_from_pretrained(self): + for model_name in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CLIPModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +class CLIPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPModel.from_pretrained(model_name).to(torch_device) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + 
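+ # Sanity check on the pinned values: softmax over the text axis turns these
+ # logits into roughly [[0.995, 0.005]], i.e. the COCO cat image is matched
+ # to "a photo of a cat" with high confidence:
+ # probs = outputs.logits_per_image.softmax(dim=1)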
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) diff --git a/test_modeling_common.py b/test_modeling_common.py new file mode 100755 index 0000000000000000000000000000000000000000..56e5cddbc96c3885ca76cfeda12bbfdc8e16994a --- /dev/null +++ b/test_modeling_common.py @@ -0,0 +1,1638 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import inspect +import os.path +import random +import tempfile +import unittest +import warnings +from typing import Dict, List, Tuple + +from huggingface_hub import HfApi +from requests.exceptions import HTTPError +from transformers import is_torch_available, logging +from transformers.file_utils import WEIGHTS_NAME, is_torch_fx_available +from transformers.models.auto import get_values +from transformers.testing_utils import ( + ENDPOINT_STAGING, + PASS, + USER, + CaptureLogger, + is_staging_test, + require_torch, + require_torch_multi_gpu, + slow, + torch_device, +) + + +if is_torch_available(): + import numpy as np + import torch + from torch import nn + + from transformers import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_MAPPING, + AdaptiveEmbedding, + BertConfig, + BertModel, + PretrainedConfig, + PreTrainedModel, + T5ForConditionalGeneration, + ) + +if is_torch_fx_available(): + from transformers.modeling_fx_utils import symbolic_trace + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key: + setattr(configs_no_init, key, 1e-10) + return configs_no_init + + +TINY_T5 = "patrickvonplaten/t5-tiny-random" + + +@require_torch +class ModelTesterMixin: + + model_tester = None + all_model_classes = () + all_generative_model_classes = () + fx_ready_model_classes = () + test_torchscript = True + test_pruning = True + test_resize_embeddings = True + test_head_masking = True + test_missing_keys = True + test_model_parallel = False + is_encoder_decoder = False + test_sequence_classification_problem_types = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() + if isinstance(v, torch.Tensor) and v.ndim > 1 + else v + for k, v in inputs_dict.items() + } + + if return_labels: + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, 
device=torch_device) + elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in [ + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), + *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in [ + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def test_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + with torch.no_grad(): + after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # Make sure we don't have nans + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_save_load__keys_to_ignore_on_save(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None) + if _keys_to_ignore_on_save is None: + continue + + # check the keys are in the original state_dict + for k in _keys_to_ignore_on_save: + self.assertIn(k, model.state_dict()) + + # check that certain keys didn't get saved with the model + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME) + state_dict_saved = torch.load(output_model_file) + for k in _keys_to_ignore_on_save: + self.assertNotIn(k, state_dict_saved) + + # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer. 
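+ # load_state_dict(strict=False) returns a NamedTuple of (missing_keys,
+ # unexpected_keys): the keys ignored on save may legitimately be missing,
+ # but nothing unexpected may appear in the checkpoint.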
+ load_result = model.load_state_dict(state_dict_saved, strict=False) + self.assertTrue( + len(load_result.missing_keys) == 0 + or set(load_result.missing_keys) == set(model._keys_to_ignore_on_save) + ) + self.assertTrue(len(load_result.unexpected_keys) == 0) + + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + def test_save_load_fast_init_from_base(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + base_class = MODEL_MAPPING[config.__class__] + + if isinstance(base_class, tuple): + base_class = base_class[0] + + for model_class in self.all_model_classes: + if model_class == base_class: + continue + + # make a copy of model class to not break future tests + # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class + class CopyClass(model_class): + pass + + model_class_copy = CopyClass + + # make sure that all keys are expected for test + model_class_copy._keys_to_ignore_on_load_missing = [] + + # make init deterministic, but make sure that + # non-initialized weights throw errors nevertheless + model_class_copy._init_weights = self._mock_init_weights + + model = base_class(config) + state_dict = model.state_dict() + + # this will often delete a single weight of a multi-weight module + # to test an edge case + random_key_to_del = random.choice(list(state_dict.keys())) + del state_dict[random_key_to_del] + + # check that certain keys didn't get saved with the model + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) + + model_fast_init = model_class_copy.from_pretrained(tmpdirname) + model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False) + + for key in model_fast_init.state_dict().keys(): + max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() + self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") + + def test_save_load_fast_init_to_base(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + base_class = MODEL_MAPPING[config.__class__] + + if isinstance(base_class, tuple): + base_class = base_class[0] + + for model_class in self.all_model_classes: + + if model_class == base_class: + continue + + # make a copy of model class to not break future tests + # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class + class CopyClass(base_class): + pass + + base_class_copy = CopyClass + + # make sure that all keys are expected for test + base_class_copy._keys_to_ignore_on_load_missing = [] + + # make init deterministic, but make sure that + # non-initialized weights throw errors nevertheless + base_class_copy._init_weights = self._mock_init_weights + + model = model_class(config) + state_dict = model.state_dict() + + # this will often delete a single weight of a multi-weight module + # to test an edge case + random_key_to_del = random.choice(list(state_dict.keys())) + del state_dict[random_key_to_del] + + # check that certain keys didn't get saved with the model + with tempfile.TemporaryDirectory() as tmpdirname: + model.config.save_pretrained(tmpdirname) + torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin")) + + model_fast_init = base_class_copy.from_pretrained(tmpdirname) + model_slow_init = 
base_class_copy.from_pretrained(tmpdirname, _fast_init=False) + + for key in model_fast_init.state_dict().keys(): + max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() + self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, model_class))[0] + second = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = [ + "input_ids", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names + else ["encoder_outputs"] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_training(self): + if not self.model_tester.is_training: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class in get_values(MODEL_MAPPING): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.model_tester.is_training or not hasattr(config, "gradient_checkpointing"): + return + + config.gradient_checkpointing = True + config.use_cache = False + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class in get_values(MODEL_MAPPING): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + 
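+ # Attention tensors are expected per layer with shape
+ # (batch, num_heads, query_len, key_len); the tester attributes read below
+ # supply the per-model sequence lengths, and chunked-attention models add
+ # an extra chunk dimension (the chunk_length branch further down).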
config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if 
hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + @slow + def test_torchscript(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self._create_and_check_torchscript(config, inputs_dict) + + @slow + def test_torchscript_output_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_attentions = True + self._create_and_check_torchscript(config, inputs_dict) + + @slow + def test_torchscript_output_hidden_state(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + self._create_and_check_torchscript(config, inputs_dict) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + decoder_input_ids = inputs["decoder_input_ids"] + decoder_attention_mask = inputs["decoder_attention_mask"] + traced_model = torch.jit.trace( + model, (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask) + ) + else: + input_ids = inputs["input_ids"] + traced_model = torch.jit.trace(model, input_ids) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_torch_fx(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self._create_and_check_torch_fx_tracing(config, inputs_dict) + + def test_torch_fx_output_loss(self): 
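+ # Same symbolic-trace round trip as test_torch_fx, but with labels attached
+ # so the traced graph must also reproduce the loss computation.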
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + self._create_and_check_torch_fx_tracing(config, inputs_dict, output_loss=True) + + def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): + if not is_torch_fx_available(): + return + + configs_no_init = _config_zero_init(config) # To be sure we have no NaN + configs_no_init.return_dict = False + + for model_class in self.fx_ready_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSMT still requires this hack -> FSMT should probably be refactored similarly to BART afterward + input_ids = inputs["input_ids"] + decoder_attention_mask = inputs["decoder_attention_mask"] + labels = inputs.get("labels", None) + input_names = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] + if labels is not None: + input_names.append("labels") + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + + model_output = model(**filtered_inputs) + + batch_size = input_ids.shape[0] + encoder_sequence_length = input_ids.shape[1] + decoder_sequence_length = decoder_attention_mask.shape[1] + + traced_model = symbolic_trace( + model, + input_names, + batch_size=batch_size, + sequence_length=[encoder_sequence_length, decoder_sequence_length], + ) + + traced_output = traced_model(**filtered_inputs) + + else: + input_names = ["input_ids", "attention_mask", "token_type_ids"] + input_ids = inputs["input_ids"] + + labels = inputs.get("labels", None) + start_positions = inputs.get("start_positions", None) + end_positions = inputs.get("end_positions", None) + if labels is not None: + input_names.append("labels") + if start_positions is not None: + input_names.append("start_positions") + if end_positions is not None: + input_names.append("end_positions") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = filtered_inputs.keys() + + model_output = model(**filtered_inputs) + + rank = len(input_ids.shape) + if rank == 2: + batch_size, sequence_length = input_ids.shape + num_choices = -1 + elif rank == 3: + batch_size, num_choices, sequence_length = input_ids.shape + else: + raise NotImplementedError( + f"symbolic_trace automatic parameter inference not implemented for input of rank {rank}."
+ ) + + traced_model = symbolic_trace( + model, + input_names, + batch_size=batch_size, + sequence_length=sequence_length, + num_choices=num_choices, + ) + traced_output = traced_model(**filtered_inputs) + + except RuntimeError: + self.fail("Couldn't trace module.") + + def flatten_output(output): + flatten = [] + for x in output: + if isinstance(x, (tuple, list)): + flatten += flatten_output(x) + elif not isinstance(x, torch.Tensor): + continue + else: + flatten.append(x) + return flatten + + model_output = flatten_output(model_output) + traced_output = flatten_output(traced_output) + num_outputs = len(model_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], traced_output[i]), + f"traced {i}th output doesn't match model {i}th output for {model_class}", + ) + + def test_headmasking(self): + if not self.test_head_masking: + return + + global_rng.seed(42) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() + + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no NaN + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + # Prepare head_mask: mask the first head of the first layer and all but the last head of the last layer + # Set requires_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, + self.model_tester.num_attention_heads, + device=torch_device, + ) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = self._prepare_for_class(inputs_dict, model_class).copy() + inputs["head_mask"] = head_mask + if model.config.is_encoder_decoder: + signature = inspect.signature(model.forward) + arg_names = [*signature.parameters.keys()] + if "decoder_head_mask" in arg_names: # necessary differentiation because of T5 model + inputs["decoder_head_mask"] = head_mask + if "cross_attn_head_mask" in arg_names: + inputs["cross_attn_head_mask"] = head_mask + outputs = model(**inputs, return_dict=True) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + + def check_attentions_validity(attentions): + # Remove NaN + for t in attentions: + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% NaNs (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the test is less complete) + + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + if len(attentions) > 2: # encoder-decoder models have only 2 layers in each module + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + + if model.config.is_encoder_decoder: + check_attentions_validity(outputs.encoder_attentions) + check_attentions_validity(outputs.decoder_attentions) + check_attentions_validity(outputs.cross_attentions) + else: +
check_attentions_validity(outputs.attentions) + + def test_head_pruning(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + model.prune_heads(heads_to_prune) + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_save_load_from_pretrained(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + model = model_class(config=config) + model.to(torch_device) + model.eval() + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + model.prune_heads(heads_to_prune) + + with tempfile.TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_save_load_from_config_init(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + + heads_to_prune = { + 0: list(range(1, self.model_tester.num_attention_heads)), + -1: [0], + } + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + + def test_head_pruning_integration(self): + if not self.test_pruning: + return + + for model_class in self.all_model_classes: + ( + config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + + heads_to_prune = {0: [0], 1: [1, 2]} + config.pruned_heads = heads_to_prune + + model = model_class(config=config) + model.to(torch_device) + model.eval() + + with 
torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + with tempfile.TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + heads_to_prune = {0: [0], 2: [1, 2]} + model.prune_heads(heads_to_prune) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs[-1] + + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) + self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) + + self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also works using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states =
True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + if config.is_encoder_decoder: + # Seq2Seq models + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_hidden_states = outputs.decoder_hidden_states[0] + decoder_attentions = outputs.decoder_attentions[0] + decoder_hidden_states.retain_grad() + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_hidden_states.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + else: + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_feed_forward_chunking(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + torch.manual_seed(0) + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + torch.manual_seed(0) + config.chunk_size_feed_forward = 1 + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] + self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) + + def test_resize_tokens_embeddings(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) +
model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) + + # make sure that decoder_input_ids are resized as well + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_resize_embeddings_untied(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + original_config.tie_word_embeddings = False + + # if the model cannot untie embeddings -> leave test + if original_config.tie_word_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1) + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + def test_model_common_attributes(self): +
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding)) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_correct_missing_keys(self): + if not self.test_missing_keys: + return + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + base_model_prefix = model.base_model_prefix + + if hasattr(model, base_model_prefix): + with tempfile.TemporaryDirectory() as temp_dir_name: + model.base_model.save_pretrained(temp_dir_name) + model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) + with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): + self.assertGreater(len(loading_info["missing_keys"]), 0) + + def test_tie_model_weights(self): + if not self.test_torchscript: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_same_values(layer_1, layer_2): + equal = True + for p1, p2 in zip(layer_1.weight, layer_2.weight): + if p1.data.ne(p2.data).sum() > 0: + equal = False + return equal + + for model_class in self.all_model_classes: + config.torchscript = True + model_not_tied = model_class(config) + if model_not_tied.get_output_embeddings() is None: + continue + + config_tied = copy.deepcopy(config) + config_tied.torchscript = False + model_tied = model_class(config_tied) + params_tied = list(model_tied.parameters()) + # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # embeddings.weight.data.div_(2) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # # Check that after modification, they remain the same. + # decoding.weight.data.div_(4) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) + # self.assertTrue(check_same_values(embeddings, decoding)) + + # Check that after resize they remain tied. 
+ model_tied.resize_token_embeddings(config.vocab_size + 10) + params_tied_2 = list(model_tied.parameters()) + self.assertEqual(len(params_tied_2), len(params_tied)) + + # decoding.weight.data.mul_(20) + # # Check that the embedding layer and decoding layer are the same in size and in value + # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) + # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + # NaN is the only value not equal to itself, so `t != t` selects exactly the NaN entries + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}.", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) +
check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # some params shouldn't be scattered by nn.DataParallel + # so just remove them if they are present. + blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"] + for k in blacklist_non_batched_params: + inputs_dict.pop(k, None) + + # move input tensors to cuda:0 + for k, v in inputs_dict.items(): + if torch.is_tensor(v): + inputs_dict[k] = v.to(0) + + for model_class in self.all_model_classes: + model = model_class(config=config) + model.to(0) + model.eval() + + # Wrap model in nn.DataParallel + model = nn.DataParallel(model) + with torch.no_grad(): + _ = model(**self._prepare_for_class(inputs_dict, model_class)) + + @require_torch_multi_gpu + def test_model_parallelization(self): + if not self.test_model_parallel: + return + + # a candidate for testing_utils + def get_current_gpu_memory_use(): + """returns a list of cuda memory allocations per GPU in MBs""" + + per_device_memory = [] + for id in range(torch.cuda.device_count()): + with torch.cuda.device(id): + per_device_memory.append(torch.cuda.memory_allocated() >> 20) # convert bytes to MB + + return per_device_memory + + # Needs a large model to see the difference. + config = self.model_tester.get_large_model_config() + + for model_class in self.all_parallelizable_model_classes: + torch.cuda.empty_cache() + + # 1. single gpu memory load + unload + memory measurements + # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) + memory_at_start = get_current_gpu_memory_use() + + # Put model on device 0 and take a memory snapshot + model = model_class(config) + model.to("cuda:0") + memory_after_model_load = get_current_gpu_memory_use() + + # The memory use on device 0 should be higher than it was initially. + self.assertGreater(memory_after_model_load[0], memory_at_start[0]) + + del model + gc.collect() + torch.cuda.empty_cache() + + # 2.
MP test + # it's essential to re-calibrate the usage before the next stage + memory_at_start = get_current_gpu_memory_use() + + # Spread model layers over multiple devices + model = model_class(config) + model.parallelize() + memory_after_parallelization = get_current_gpu_memory_use() + + # Assert that the memory use on all devices is higher than it was when loaded only on CPU + for n in range(torch.cuda.device_count()): + self.assertGreater(memory_after_parallelization[n], memory_at_start[n]) + + # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it + self.assertLess(memory_after_parallelization[0], memory_after_model_load[0]) + + # Assert that the memory use of device 1 is higher than it was when the entire model was loaded + # on device 0 and device 1 wasn't used at all + self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1]) + + del model + gc.collect() + torch.cuda.empty_cache() + + @require_torch_multi_gpu + def test_model_parallel_equal_results(self): + if not self.test_model_parallel: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_parallelizable_model_classes: + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + + def cast_to_device(dictionary, device): + output = {} + for k, v in dictionary.items(): + if isinstance(v, torch.Tensor): + output[k] = v.to(device) + else: + output[k] = v + + return output + + model = model_class(config) + output = model(**cast_to_device(inputs_dict, "cpu")) + + model.parallelize() + + parallel_output = model(**cast_to_device(inputs_dict, "cuda:0")) + + for value, parallel_value in zip(output, parallel_output): + if isinstance(value, torch.Tensor): + self.assertTrue(torch.allclose(value, parallel_value.to("cpu"), atol=1e-7)) + elif isinstance(value, (Tuple, List)): + for value_, parallel_value_ in zip(value, parallel_value): + self.assertTrue(torch.allclose(value_, parallel_value_.to("cpu"), atol=1e-7)) + + @require_torch_multi_gpu + def test_model_parallel_beam_search(self): + if not self.test_model_parallel: + return + + all_generative_and_parallelizable_model_classes = tuple( + set(self.all_generative_model_classes).intersection(self.all_parallelizable_model_classes) + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in all_generative_and_parallelizable_model_classes: + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + def cast_to_device(dictionary, device): + output = {} + for k, v in dictionary.items(): + if isinstance(v, torch.Tensor): + output[k] = v.to(device) + else: + output[k] = v + + return output + + model.parallelize() + model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2) + + def test_problem_types(self): + if not self.test_sequence_classification_problem_types: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + problem_types = [ + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] + + for model_class in self.all_model_classes: + if model_class not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + continue + + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + + 
config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] + + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning from PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom that something is wrong with the regression problem. + # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + self.assertListEqual(warning_list, []) + + loss.backward() + + +global_rng = random.Random() + + +def ids_tensor(shape, vocab_size, rng=None, name=None): + # Creates a random integer tensor of the given shape with values in [0, vocab_size) + if rng is None: + rng = global_rng + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous() + + +def random_attention_mask(shape, rng=None, name=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=rng, name=name) + # make sure that at least one token is attended to for each batch + attn_mask[:, -1] = 1 + return attn_mask + + +def floats_tensor(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.random() * scale) + + return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous() + + +@require_torch +class ModelUtilsTest(unittest.TestCase): + @slow + def test_model_from_pretrained(self): + for model_name in BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = BertConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, PretrainedConfig) + + model = BertModel.from_pretrained(model_name) + model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, PreTrainedModel) + for value in loading_info.values(): + self.assertEqual(len(value), 0) + + config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + + # Not sure this is the intended behavior.
TODO fix Lysandre & Thom + config.name_or_path = model_name + + model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True) + self.assertEqual(model.config.output_hidden_states, True) + self.assertEqual(model.config, config) + + def test_model_from_pretrained_with_different_pretrained_model_name(self): + model = T5ForConditionalGeneration.from_pretrained(TINY_T5) + self.assertIsNotNone(model) + + logger = logging.get_logger("transformers.configuration_utils") + with CaptureLogger(logger) as cl: + BertModel.from_pretrained(TINY_T5) + self.assertTrue("You are using a model of type t5 to instantiate a model of type bert" in cl.out) + + +@require_torch +@is_staging_test +class ModelPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._api = HfApi(endpoint=ENDPOINT_STAGING) + cls._token = cls._api.login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + cls._api.delete_repo(token=cls._token, name="test-model") + except HTTPError: + pass + + try: + cls._api.delete_repo(token=cls._token, name="test-model-org", organization="valid_org") + except HTTPError: + pass + + def test_push_to_hub(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + model = BertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, push_to_hub=True, repo_name="test-model", use_auth_token=self._token) + + new_model = BertModel.from_pretrained(f"{USER}/test-model") + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.equal(p1, p2)) + + def test_push_to_hub_in_organization(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + model = BertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained( + tmp_dir, + push_to_hub=True, + repo_name="test-model-org", + use_auth_token=self._token, + organization="valid_org", + ) + + new_model = BertModel.from_pretrained("valid_org/test-model-org") + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.equal(p1, p2)) diff --git a/test_modeling_convbert.py b/test_modeling_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..ebe7188755133c120ce2c5486245f9bd9e8c7729 --- /dev/null +++ b/test_modeling_convbert.py @@ -0,0 +1,433 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ConvBERT model. 
""" + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + ConvBertConfig, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertModel, + ) + from transformers.models.convbert.modeling_convbert import CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class ConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, 
choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ConvBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = 
token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + ConvBertModel, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_head_masking = False + test_sequence_classification_problem_types = True + + def setUp(self): + self.model_tester = ConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ConvBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in 
self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also works using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads // 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads // 2, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads // 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: +
self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads // 2, encoder_seq_length, encoder_key_length], + ) + + +@require_torch +class ConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = ConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = torch.tensor([[1, 2, 3, 4, 5, 6]]) + output = model(input_ids)[0] + + expected_shape = torch.Size((1, 6, 768)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[[-0.0864, -0.4898, -0.3677], [0.1434, -0.2952, -0.7640], [-0.0112, -0.4432, -0.5432]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_ctrl.py b/test_modeling_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..d2254623561c8049a17809651bbcbf22490cb578 --- /dev/null +++ b/test_modeling_ctrl.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, + CTRLConfig, + CTRLForSequenceClassification, + CTRLLMHeadModel, + CTRLModel, + ) + + +class CTRLModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 14 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size],
self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = CTRLConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CTRLModel(config=config) + model.to(torch_device) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CTRLLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} + + return config, inputs_dict + + def create_and_check_ctrl_for_sequence_classification(self, config, input_ids, head_mask, token_type_ids, *args): + config.num_labels = self.num_labels + model = CTRLForSequenceClassification(config) + model.to(torch_device) + model.eval() + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + +@require_torch +class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) if is_torch_available() else () + all_generative_model_classes = (CTRLLMHeadModel,) if is_torch_available() else () + test_pruning = True + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CTRLModelTester(self) + self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_ctrl_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_ctrl_model(*config_and_inputs) + + def test_ctrl_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in CTRL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = CTRLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class CTRLModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_ctrl(self): + model = CTRLLMHeadModel.from_pretrained("ctrl") + model.to(torch_device) + input_ids = torch.tensor( + [[11859, 0, 1611, 8]], dtype=torch.long, device=torch_device + ) # Legal the president is + expected_output_ids = [ + 11859, + 0, + 1611, + 8, + 5, + 150, + 26449, + 2, + 19, + 348, + 469, + 3, + 2595, + 48, + 20740, + 246533, + 246533, + 19, + 30, + 5, + ] # Legal the president is a good guy and I don't want to lose my job. \n \n I have a + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/test_modeling_deberta.py b/test_modeling_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..1c66617b884c462ed46fd62c965a3b130b6e0ab2 --- /dev/null +++ b/test_modeling_deberta.py @@ -0,0 +1,283 @@ +# coding=utf-8 +# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
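+# NOTE: unlike most of these suites, the tester class below is nested inside the +# unittest.TestCase subclass rather than defined at module level. +# A minimal usage sketch of the model under test (illustration only, not executed +# by the runner; it assumes the microsoft/deberta-base checkpoint exercised in the +# integration test at the bottom of this file): +# +#     from transformers import DebertaModel, DebertaTokenizer +#     tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base") +#     model = DebertaModel.from_pretrained("microsoft/deberta-base") +#     hidden = model(**tokenizer("hello world", return_tensors="pt")).last_hidden_state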
+ +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + DebertaConfig, + DebertaForMaskedLM, + DebertaForQuestionAnswering, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaModel, + ) + from transformers.models.deberta.modeling_deberta import DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST + + +@require_torch +class DebertaModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DebertaModel, + DebertaForMaskedLM, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaForQuestionAnswering, + ) + if is_torch_available() + else () + ) + + test_torchscript = False + test_pruning = False + test_head_masking = False + is_encoder_decoder = False + + class DebertaModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DebertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + 
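+ # relative_attention, position_biased_input and pos_att_type below are the + # DeBERTa-specific disentangled-attention switches; note that pos_att_type is + # the literal string "None" per the tester defaults, not the None singleton.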
num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + pos_att_type=self.pos_att_type, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result.loss.size()), []) + + def create_and_check_deberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaModel(config=config) + model.to(torch_device) + model.eval() + sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids)[0] + + self.parent.assertListEqual( + list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_deberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_deberta_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def create_and_check_deberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_deberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + 
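+ # choice_labels is threaded through but never consumed: this port ships no + # multiple-choice head, so inputs_dict keeps only ids, type ids and the mask.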
choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = DebertaModelTest.DebertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DebertaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class DebertaModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = DebertaModel.from_pretrained("microsoft/deberta-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[-0.5986, -0.8055, -0.8462], [1.4484, -0.9348, -0.8059], [0.3123, 0.0032, -1.4131]]] + ) + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4), f"{output[:, 1:4, 1:4]}") diff --git a/test_modeling_deberta_v2.py b/test_modeling_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..718682edb36dda3ed82851df75d53fa29226e975 --- /dev/null +++ b/test_modeling_deberta_v2.py @@ -0,0 +1,283 @@ +# coding=utf-8 +# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
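+# NOTE: this file mirrors test_modeling_deberta.py almost line for line, swapping +# in the DebertaV2* classes and the microsoft/deberta-v2-xlarge checkpoint.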
+ +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + DebertaV2Config, + DebertaV2ForMaskedLM, + DebertaV2ForQuestionAnswering, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2Model, + ) + from transformers.models.deberta_v2.modeling_deberta_v2 import DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST + + +@require_torch +class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DebertaV2Model, + DebertaV2ForMaskedLM, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2ForQuestionAnswering, + ) + if is_torch_available() + else () + ) + + test_torchscript = False + test_pruning = False + test_head_masking = False + is_encoder_decoder = False + + class DebertaV2ModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DebertaV2Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + 
num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + pos_att_type=self.pos_att_type, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result.loss.size()), []) + + def create_and_check_deberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2Model(config=config) + model.to(torch_device) + model.eval() + sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids)[0] + + self.parent.assertListEqual( + list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_deberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2ForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_deberta_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaV2ForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def create_and_check_deberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaV2ForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_deberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2ForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + 
token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = DebertaV2ModelTest.DebertaV2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DebertaV2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class DebertaV2ModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.2356, 0.1948, 0.0369], [-0.1063, 0.3586, -0.5152], [-0.6399, -0.0259, -0.2525]]] + ) + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4), f"{output[:, 1:4, 1:4]}") diff --git a/test_modeling_deit.py b/test_modeling_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..0eb24f84cf0ef320ac11bb3538e0011b740385ca --- /dev/null +++ b/test_modeling_deit.py @@ -0,0 +1,397 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DeiT model. 
""" + + +import inspect +import unittest + +from transformers.file_utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + MODEL_MAPPING, + DeiTConfig, + DeiTForImageClassification, + DeiTForImageClassificationWithTeacher, + DeiTModel, + ) + from transformers.models.deit.modeling_deit import DEIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple + + +if is_vision_available(): + from PIL import Image + + from transformers import DeiTFeatureExtractor + + +class DeiTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = DeiTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, pixel_values, labels + + def create_and_check_model(self, config, pixel_values, labels): + model = DeiTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + 2 (we add 2 for the [CLS] and distillation tokens) + image_size = to_2tuple(self.image_size) + patch_size = to_2tuple(self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 2, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = DeiTForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, 
(self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DeiTModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DeiT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + DeiTModel, + DeiTForImageClassification, + DeiTForImageClassificationWithTeacher, + ) + if is_torch_available() + else () + ) + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DeiTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # DeiT does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in DeiT, the seq_len equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 2 + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + 
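+ # the chunk_length branches below come from the shared mixin and only fire for + # Reformer-style testers; DeiT defines no chunk_length, so the plain + # (heads, seq_len, seq_len) branch is the one exercised here.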
config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # DeiT has a different seq_length + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 2 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # special case for DeiTForImageClassificationWithTeacher model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == 
"DeiTForImageClassificationWithTeacher": + del inputs_dict["labels"] + + return inputs_dict + + def test_training(self): + if not self.model_tester.is_training: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + # DeiTForImageClassificationWithTeacher supports inference-only + if ( + model_class in MODEL_MAPPING.values() + or model_class.__name__ == "DeiTForImageClassificationWithTeacher" + ): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DEIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DeiTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_vision +class DeiTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ( + DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224").to( + torch_device + ) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.0266, 0.1912, -1.2861]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_detr.py b/test_modeling_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..093e75cf993661be388d2ddb894ef2be77167f91 --- /dev/null +++ b/test_modeling_detr.py @@ -0,0 +1,527 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DETR model. 
""" + + +import inspect +import math +import unittest + +from transformers import is_timm_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_timm, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor + + +if is_timm_available(): + import torch + + from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrModel + + +if is_vision_available(): + from PIL import Image + + from transformers import DetrFeatureExtractor + + +@require_timm +class DetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=256, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = DetrConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + ) + return config, pixel_values, pixel_mask, labels + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_detr_model(self, config, pixel_values, pixel_mask, labels): + model = 
DetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = DetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_timm +class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DetrModel, + DetrForObjectDetection, + DetrForSegmentation, + ) + if is_timm_available() + else () + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["DetrForObjectDetection", "DetrForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = DetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=DetrConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_model(*config_and_inputs) + + def test_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = 
True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "DetrForObjectDetection": + correct_outlen += 2 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "DetrForSegmentation": + correct_outlen += 3 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + 
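+ # sanity arithmetic: the tester feeds 200x200 images and the default ResNet + # backbone downsamples by a factor of 32, so encoder_seq_length is + # ceil(200 / 32) ** 2 = 49 feature positions, while the decoder length is just + # the num_queries=12 learned object queries: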
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" in arg_names and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels + 1, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE =
1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class DetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None + + def test_inference_no_head(self): + model = DetrModel.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 100, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + def test_inference_object_detection_head(self): + model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + def test_inference_panoptic_segmentation_head(self): + model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + encoding = feature_extractor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 
0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, atol=1e-4)) diff --git a/test_modeling_distilbert.py b/test_modeling_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..269cadf957c07a1d6628a0003d137ea2c6fd55aa --- /dev/null +++ b/test_modeling_distilbert.py @@ -0,0 +1,269 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertForMultipleChoice, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DistilBertModel, + ) + + class DistilBertModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, 
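+ # DistilBERT has no token-type embeddings (hence use_token_type_ids=False above), + # and its config uses dim / n_layers / n_heads / hidden_dim in place of the + # BERT-style hidden_size family, as the DistilBertConfig call below shows.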
self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DistilBertConfig( + vocab_size=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_mask) + result = model(input_ids) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size) + ) + + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DistilBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DistilBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_distilbert_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = DistilBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = 
input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DistilBertModel, + DistilBertForMaskedLM, + DistilBertForMultipleChoice, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + ) + if is_torch_available() + else () + ) + fx_ready_model_classes = all_model_classes + test_pruning = True + test_torchscript = True + test_resize_embeddings = True + test_sequence_classification_problem_types = True + + def setUp(self): + self.model_tester = DistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class DistilBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = DistilBertModel.from_pretrained("distilbert-base-uncased") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-0.1639, 0.3299, 0.1648], [-0.1746, 0.3289, 0.1710], [-0.1884, 0.3357, 0.1810]]] + ) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))
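For orientation: every tester class in these files builds random model inputs with ids_tensor and random_attention_mask, imported from test_modeling_common. The following is only a minimal sketch of the behaviour the tests above assume, not the library's exact implementation:

import torch

def ids_tensor(shape, vocab_size):
    # sketch: a long tensor of random token ids drawn uniformly from [0, vocab_size)
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)

def random_attention_mask(shape):
    # sketch: a random 0/1 mask whose last position is forced to 1, so no row is
    # fully masked and attention always has at least one valid token to attend to
    mask = ids_tensor(shape, vocab_size=2)
    mask[:, -1] = 1
    return mask

diff --git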
a/test_modeling_dpr.py b/test_modeling_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..05c9844b4be0ad38a501309807135d4e6c813d39 --- /dev/null +++ b/test_modeling_dpr.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer + from transformers.models.dpr.modeling_dpr import ( + DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, + DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, + DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +class DPRModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + projection_dim=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.projection_dim = projection_dim + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], 
self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DPRConfig( + projection_dim=self.projection_dim, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_context_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DPRContextEncoder(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_question_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DPRQuestionEncoder(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_reader( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DPRReader(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + ) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_torch +class DPRModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DPRContextEncoder, + DPRQuestionEncoder, + DPRReader, + ) + if is_torch_available() + else () + ) + + test_resize_embeddings = False + test_missing_keys = False # why? 
+ test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = DPRModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_context_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_context_encoder(*config_and_inputs) + + def test_question_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_question_encoder(*config_and_inputs) + + def test_reader_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reader(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + for model_name in DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DPRQuestionEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + for model_name in DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DPRReader.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class DPRModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", return_dict=False) + model.to(torch_device) + + input_ids = torch.tensor( + [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]], dtype=torch.long, device=torch_device + ) # [CLS] hello, is my dog cute? [SEP] + output = model(input_ids)[0] # embedding shape = (1, 768) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [ + [ + 0.03236253, + 0.12753335, + 0.16818509, + 0.00279786, + 0.3896933, + 0.24264945, + 0.2178971, + -0.02335227, + -0.08481959, + -0.14324117, + ] + ], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output[:, :10], expected_slice, atol=1e-4)) + + @slow + def test_reader_inference(self): + tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base") + model.to(torch_device) + + encoded_inputs = tokenizer( + questions="What is love ?", + titles="Haddaway", + texts="What Is Love is a song recorded by the artist Haddaway", + padding=True, + return_tensors="pt", + ) + encoded_inputs.to(torch_device) + + outputs = model(**encoded_inputs) + + # compare the actual values for a slice.
+ expected_start_logits = torch.tensor( + [[-10.3005, -10.7765, -11.4872, -11.6841, -11.9312, -10.3002, -9.8544, -11.7378, -12.0821, -10.2975]], + dtype=torch.float, + device=torch_device, + ) + + expected_end_logits = torch.tensor( + [[-11.0684, -11.7041, -11.5397, -10.3465, -10.8791, -6.8443, -11.9959, -11.0364, -10.0096, -6.8405]], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(outputs.start_logits[:, :10], expected_start_logits, atol=1e-4)) + self.assertTrue(torch.allclose(outputs.end_logits[:, :10], expected_end_logits, atol=1e-4)) diff --git a/test_modeling_electra.py b/test_modeling_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..8fcbb445a190c94e5655009b2d32fbf0f1bae42c --- /dev/null +++ b/test_modeling_electra.py @@ -0,0 +1,366 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + ElectraConfig, + ElectraForMaskedLM, + ElectraForMultipleChoice, + ElectraForPreTraining, + ElectraForQuestionAnswering, + ElectraForSequenceClassification, + ElectraForTokenClassification, + ElectraModel, + ) + from transformers.models.electra.modeling_electra import ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST + + +class ElectraModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = 
ids_tensor([self.batch_size], self.num_choices) + fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) + + def create_and_check_electra_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_electra_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_electra_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_electra_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_electra_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, 
+ choice_labels, + fake_token_labels, + ): + model = ElectraForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_choices = self.num_choices + model = ElectraForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ElectraModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + ElectraModel, + ElectraForPreTraining, + ElectraForMaskedLM, + ElectraForMultipleChoice, + ElectraForTokenClassification, + ElectraForSequenceClassification, + ElectraForQuestionAnswering, + ) + if is_torch_available() + else () + ) + fx_ready_model_classes = all_model_classes + test_sequence_classification_problem_types = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = ElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_electra_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_electra_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) + + def test_for_pre_training(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_sequence_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_question_answering(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ElectraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class ElectraModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = ElectraModel.from_pretrained("google/electra-small-discriminator") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 256)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[0.4471, 0.6821, -0.3265], [0.4627, 0.5255, -0.3668], [0.4532, 0.3313, -0.4344]]] + ) + + self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) diff --git a/test_modeling_encoder_decoder.py b/test_modeling_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..26e381180d9d0da92ebf04f61f4de1deeb56e879 --- /dev/null +++ b/test_modeling_encoder_decoder.py @@ -0,0 +1,922 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
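Before the test code, a quick usage sketch of the EncoderDecoderModel API that the classes below exercise. The checkpoint name is the same public BERT checkpoint the tests themselves load, but the snippet is illustrative only, not part of the test suite:

import torch
from transformers import BertTokenizer, EncoderDecoderModel

# Pair a pretrained encoder with a pretrained decoder; cross-attention layers are
# added to the decoder automatically, mirroring what the testers below set up by
# hand via decoder_config.add_cross_attention = True.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

input_ids = tokenizer("A short input sentence.", return_tensors="pt").input_ids
# BERT has no bos token, so generation starts from the pad token id, the same
# workaround check_encoder_decoder_model_generate applies below.
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
print(tokenizer.decode(generated[0], skip_special_tokens=True))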
+ + +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_modeling_bart import BartStandaloneDecoderModelTester +from .test_modeling_bert import BertModelTester +from .test_modeling_bert_generation import BertGenerationEncoderTester +from .test_modeling_common import ids_tensor +from .test_modeling_gpt2 import GPT2ModelTester +from .test_modeling_prophetnet import ProphetNetStandaloneDecoderModelTester +from .test_modeling_roberta import RobertaModelTester + + +if is_torch_available(): + import numpy as np + import torch + + from transformers import ( + AutoConfig, + AutoTokenizer, + BartForCausalLM, + BertGenerationDecoder, + BertGenerationEncoder, + BertLMHeadModel, + BertModel, + BertTokenizer, + EncoderDecoderConfig, + EncoderDecoderModel, + GPT2LMHeadModel, + ProphetNetForCausalLM, + RobertaForCausalLM, + RobertaModel, + ) + from transformers.modeling_outputs import BaseModelOutput + + +@require_torch +class EncoderDecoderMixin: + def get_encoder_decoder_model(self, config, decoder_config): + pass + + def prepare_config_and_inputs(self): + pass + + def get_pretrained_model(self): + pass + + def check_encoder_decoder_model_from_pretrained_configs( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs + ): + encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) + self.assertTrue(encoder_decoder_config.decoder.is_decoder) + + enc_dec_model = EncoderDecoderModel(encoder_decoder_config) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + self.assertTrue(enc_dec_model.config.decoder.is_decoder) + self.assertTrue(enc_dec_model.config.decoder.add_cross_attention) + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_states) + outputs_encoder_decoder = enc_dec_model( + encoder_outputs=encoder_outputs, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + 
self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_from_pretrained( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + return_dict, + **kwargs + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} + enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=True, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_save_and_load( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + with torch.no_grad(): + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + enc_dec_model.save_pretrained(tmpdirname) + # reload the saved model and compare its outputs, otherwise the check is vacuous + enc_dec_model = EncoderDecoderModel.from_pretrained(tmpdirname) + enc_dec_model.to(torch_device) + + after_outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_save_and_load_encoder_decoder_model( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + with torch.no_grad(): + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) + enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) + # reload from the two saved halves and compare outputs of the restored model + enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained( + encoder_pretrained_model_name_or_path=encoder_tmp_dirname, + decoder_pretrained_model_name_or_path=decoder_tmp_dirname, + ) + enc_dec_model.to(torch_device) + + after_outputs = enc_dec_model(
+ input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_encoder_decoder_model_labels( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + labels=labels, + ) + + loss = outputs_encoder_decoder["loss"] + # check that backprop works + loss.backward() + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_output_attentions( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + ) + + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), config.num_hidden_layers) + + self.assertEqual( + encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]) + ) + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, "num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] * ( + 1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0) + ) + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]), + ) + + def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) 
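+ # note: the model is not put in eval() mode for this check; only the shape of
+ # the generated ids is asserted below, so dropout randomness cannot affect it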
+ + # Bert does not have a bos token id, so use pad_token_id instead + generated_output = enc_dec_model.generate( + input_ids, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + ) + self.assertEqual(generated_output.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) + + def create_and_check_encoder_decoder_shared_weights( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs + ): + torch.manual_seed(0) + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + model.to(torch_device) + model.eval() + # load_state_dict copies weights but does not tie them + decoder_state_dict = model.decoder._modules[model.decoder.base_model_prefix].state_dict() + model.encoder.load_state_dict(decoder_state_dict, strict=False) + + torch.manual_seed(0) + tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(config, decoder_config) + config = EncoderDecoderConfig.from_encoder_decoder_configs( + tied_encoder_model.config, tied_decoder_model.config, tie_encoder_decoder=True + ) + tied_model = EncoderDecoderModel(encoder=tied_encoder_model, decoder=tied_decoder_model, config=config) + tied_model.to(torch_device) + tied_model.eval() + + model_result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that the tied model has fewer parameters + self.assertLess(sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + # check that outputs are equal + self.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + tied_model.save_pretrained(tmpdirname) + tied_model = EncoderDecoderModel.from_pretrained(tmpdirname) + tied_model.to(torch_device) + tied_model.eval() + + # check that the tied model has fewer parameters + self.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that outputs are equal + self.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + def test_encoder_decoder_model(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained_configs(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) + +
def test_encoder_decoder_model_from_pretrained_return_dict(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) + + def test_save_and_load_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load(**input_ids_dict) + + def test_save_and_load_from_encoder_decoder_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load_encoder_decoder_model(**input_ids_dict) + + def test_encoder_decoder_model_labels(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_labels(**input_ids_dict) + + def test_encoder_decoder_model_output_attentions(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions(**input_ids_dict) + + def test_encoder_decoder_model_generate(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_generate(**input_ids_dict) + + def test_encoder_decoder_model_shared_weights(self): + input_ids_dict = self.prepare_config_and_inputs() + self.create_and_check_encoder_decoder_shared_weights(**input_ids_dict) + + @slow + def test_real_model_save_load_from_pretrained(self): + model_2 = self.get_pretrained_model() + model_2.to(torch_device) + input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size) + decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size) + attention_mask = ids_tensor([13, 5], vocab_size=2) + with torch.no_grad(): + outputs = model_2( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmp_dirname: + model_2.save_pretrained(tmp_dirname) + model_1 = EncoderDecoderModel.from_pretrained(tmp_dirname) + model_1.to(torch_device) + + after_outputs = model_1( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + +@require_torch +class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "bert-base-cased") + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = BertLMHeadModel(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = BertModelTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + 
"decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + @slow + def test_bert2bert_summarization(self): + model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + model.to(torch_device) + tokenizer = BertTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + + ARTICLE_SIGMA = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents.""" + + ARTICLE_AMERICA = """(CNN) -- The 2013 America's Cup will be faster than ever after organizers announced that wingsail catamarans will be the vessels of choice. The race has historically been between yachts with a single hull, however the 34th edition of the contest will be between multi-hull vessels with wings rather than traditional sails. This means the boats will travel faster through the water, with top speeds in excess of 30 knots, almost three times as fast as in the past. 
The Golden Gate Yacht Club, hosts of the 2013 race and holders of the cup, have also announced a new, shorter race format for the competition. In an attempt to boost interest in one of sailing's showpiece events an annual World Series will also take place, starting in 2011, resulting a world champion team being crowned. In addition, a youth America's Cup will also be introduced, set to begin in 2012. In a statement on the International Sailing Federation (ISAF) website, the CEO of 2010's winning syndicate BMW ORACLE Racing Russell Coutts explained the reasons behind the changes. "We believe this new format and new boat will put the America's Cup back at the pinnacle of our sport," said Coutts. "These changes will give equal opportunity to competitors and long-term economic stability to all teams and all commercial partners. We promised fairness and innovation and this is what we've delivered." The statement also explained how, in addition to generating interest in the contest, the new annual America's Cup World Series will provide increased commercial revenue for the teams and their sponsors. The venue for the 2013 contest is not due to be announced until the end of the year, with San Francisco, Valencia and a location near Rome believed to be under consideration. Vincenzo Onorato, President of the 2013 challengers Mascalzone Latino, supported the changes: "I think that we need to acknowledge that the Defender has kept its word. The America's Cup is going to have fair rules and a truly independent management of the racing.""" + + EXPECTED_SUMMARY_SIGMA = """sae was founded in 1856, five years before the civil war. the fraternity has had to work hard to change recently. the university of oklahoma president says the university's affiliation with the fraternity is permanently done. the sae has had a string of members in recent months.""" + + EXPECTED_SUMMARY_AMERICA = """the 2013 america's cup will be faster than ever. the 34th edition of the competition will be held in 2011. the 2013 race will be between multi - hull vessels with wings rather than traditional sails. the new america'' cup will provide increased commercial revenue. 
the event will also be expanded to a youth america'cup.""" + + input_dict = tokenizer( + [ARTICLE_SIGMA, ARTICLE_AMERICA], + padding="max_length", + pad_to_max_length=True, + max_length=512, + return_tensors="pt", + ) + output_ids = model.generate( + input_dict["input_ids"].to(torch_device), attention_mask=input_dict["attention_mask"].to(torch_device) + ) + summary = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_SIGMA, EXPECTED_SUMMARY_AMERICA]) + + +@require_torch +class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google/bert_for_seq_generation_L-24_bbc_encoder", "google/bert_for_seq_generation_L-24_bbc_encoder" + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertGenerationEncoder(config) + decoder_model = BertGenerationDecoder(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = BertGenerationEncoderTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + input_mask, + token_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_input_mask, + decoder_token_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_token_labels": decoder_token_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + @slow + def test_roberta2roberta_summarization(self): + model = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_bbc") + model.to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_bbc") + + ARTICLE_PS3 = """The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn't affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. "In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like download software or visit online stores.""" + + ARTICLE_TOSHIBA = """An independent panel appointed by Toshiba found institutional accounting irregularities, the firm said in a statement to investors. Toshiba said it "takes the situation it has caused very seriously" and that it "deeply apologised" to shareholders. 
The overstatement was roughly triple an initial Toshiba estimate. The probe could lead to a restatement of earnings, a board overhaul and potential action by regulators. "Within Toshiba, there was a corporate culture in which one could not go against the wishes of superiors," the report said. "Therefore, when top management presented 'challenges', division presidents, line managers and employees below them continually carried out inappropriate accounting practices to meet targets in line with the wishes of their superiors." The improper accounting practices stretched back to 2008.""" + + EXPECTED_SUMMARY_PS3 = """Sony has said that a bug in its PlayStation 3 console is preventing them from using the machine as a computer.""" + + EXPECTED_SUMMARY_TOSHIBA = """Japanese electronics giant Toshiba overstated its annual earnings by more than a third last year, according to a report.""" + + input_dict = tokenizer( + [ARTICLE_PS3, ARTICLE_TOSHIBA], max_length=512, padding="max_length", return_tensors="pt" + ) + output_ids = model.generate( + input_dict["input_ids"].to(torch_device), attention_mask=input_dict["attention_mask"].to(torch_device) + ) + summary = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_PS3, EXPECTED_SUMMARY_TOSHIBA]) + + +@require_torch +class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = RobertaModel(config) + decoder_model = RobertaForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = RobertaModelTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base") + + +@require_torch +class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = GPT2LMHeadModel(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = GPT2ModelTester(self, batch_size=13) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = 
model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_input_mask, + decoder_head_mask, + decoder_token_type_ids, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2") + + def test_encoder_decoder_model_shared_weights(self): + pass + + +@require_torch +class ProphetNetEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = ProphetNetForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = ProphetNetStandaloneDecoderModelTester( + self, batch_size=13, hidden_size=32, max_position_embeddings=512 + ) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "labels": lm_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "bert-large-uncased", "microsoft/prophetnet-large-uncased" + ) + + def test_encoder_decoder_model_shared_weights(self): + pass + + +@require_torch +class BartEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = BartForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = BartStandaloneDecoderModelTester( + self, batch_size=13, d_model=32, max_position_embeddings=512 + ) + 
encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "labels": lm_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "facebook/bart-large") + + def test_encoder_decoder_model_shared_weights(self): + pass + + +@require_torch +class EncoderDecoderModelTest(unittest.TestCase): + def get_from_encoderdecoder_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased") + + def get_decoder_config(self): + config = AutoConfig.from_pretrained("bert-base-uncased") + config.is_decoder = True + config.add_cross_attention = True + return config + + def get_encoderdecoder_model(self): + return EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + + def get_encoder_decoder_models(self): + encoder_model = BertModel.from_pretrained("bert-base-uncased") + decoder_model = BertLMHeadModel.from_pretrained("bert-base-uncased", config=self.get_decoder_config()) + return {"encoder": encoder_model, "decoder": decoder_model} + + def _check_configuration_tie(self, model): + assert id(model.decoder.config) == id(model.config.decoder) + assert id(model.encoder.config) == id(model.config.encoder) + + @slow + def test_configuration_tie(self): + model = self.get_from_encoderdecoder_pretrained_model() + self._check_configuration_tie(model) + + model = EncoderDecoderModel(**self.get_encoder_decoder_models()) + self._check_configuration_tie(model) + + model = self.get_encoderdecoder_model() + self._check_configuration_tie(model) diff --git a/test_modeling_flaubert.py b/test_modeling_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5f2d6805e0714ddc7e35332786225225dd6b71 --- /dev/null +++ b/test_modeling_flaubert.py @@ -0,0 +1,417 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
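For context on the `test_configuration_tie` checks above: `EncoderDecoderModel` is expected to hold the very same config objects in `model.config.encoder`/`model.config.decoder` as the sub-models themselves, which is why the test compares `id()`s rather than values. A minimal sketch of that invariant (checkpoint names taken from the test above):

```python
from transformers import EncoderDecoderModel

# Compose two pretrained stacks; the decoder side gets cross-attention added.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# The composite config shares objects with the sub-models (identity, not equality).
assert model.config.encoder is model.encoder.config
assert model.config.decoder is model.decoder.config

# Hence a change made through one reference is visible through the other.
model.encoder.config.output_attentions = True
assert model.config.encoder.output_attentions
```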
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + FlaubertConfig, + FlaubertForMultipleChoice, + FlaubertForQuestionAnswering, + FlaubertForQuestionAnsweringSimple, + FlaubertForSequenceClassification, + FlaubertForTokenClassification, + FlaubertModel, + FlaubertWithLMHeadModel, + ) + from transformers.models.flaubert.modeling_flaubert import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class FlaubertModelTester(object): + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_lengths = True + self.use_token_type_ids = True + self.use_labels = True + self.gelu_activation = True + self.sinusoidal_embeddings = False + self.causal = False + self.asm = False + self.n_langs = 2 + self.vocab_size = 99 + self.n_special = 0 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 12 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.summary_type = "last" + self.use_proj = None + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = FlaubertConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def create_and_check_flaubert_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
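+ # re-run with progressively fewer optional inputs to confirm they are genuinely optional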
+ result = model(input_ids, langs=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_flaubert_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertWithLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_flaubert_simple_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForQuestionAnsweringSimple(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + result = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_flaubert_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForQuestionAnswering(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) + + (total_loss,) = result_with_labels.to_tuple() + + result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) + + (total_loss,) = result_with_labels.to_tuple() + + self.parent.assertEqual(result_with_labels.loss.shape, ()) + self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual( + result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual( + result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) + + def create_and_check_flaubert_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForSequenceClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + result = model(input_ids, labels=sequence_labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_flaubert_token_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + 
input_mask, + ): + config.num_labels = self.num_labels + model = FlaubertForTokenClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_flaubert_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = FlaubertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths} + return config, inputs_dict + + +@require_torch +class FlaubertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + FlaubertModel, + FlaubertWithLMHeadModel, + FlaubertForQuestionAnswering, + FlaubertForQuestionAnsweringSimple, + FlaubertForSequenceClassification, + FlaubertForTokenClassification, + FlaubertForMultipleChoice, + ) + if is_torch_available() + else () + ) + + # Flaubert has 2 QA models -> need to manually set the correct labels for one of them here + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "FlaubertForQuestionAnswering": + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def setUp(self): + self.model_tester = FlaubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_flaubert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_model(*config_and_inputs) + + def test_flaubert_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs) + + def test_flaubert_simple_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_simple_qa(*config_and_inputs) + + def test_flaubert_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_flaubert_qa(*config_and_inputs) + + def test_flaubert_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs) + + def test_flaubert_token_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs) + + def test_flaubert_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = FlaubertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class FlaubertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-2.6251, -1.4298, -0.0227], [-2.8510, -1.6387, 0.2258], [-2.8114, -1.1832, -0.3066]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_flax_bart.py b/test_modeling_flax_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f446c4556f29510e77fc516ed2f9b19578db515a --- /dev/null +++ b/test_modeling_flax_bart.py @@ -0,0 +1,417 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
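The BART tester below builds `decoder_input_ids` with `shift_tokens_right`, and `test_shift_tokens_right` further down pins its behavior. As a reading aid, here is a hedged, behavior-equivalent sketch inferred from that test, not the library source in `transformers.models.bart.modeling_flax_bart`:

```python
import numpy as np

def shift_tokens_right_sketch(input_ids, pad_token_id, decoder_start_token_id):
    # Shift every token one position to the right, dropping the last column,
    # and place the decoder start token in front: the decoder then learns to
    # predict position t from positions < t.
    shifted = np.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1]
    shifted[:, 0] = decoder_start_token_id
    # Label positions masked with -100 (ignored by the loss) must become real pad ids.
    return np.where(shifted == -100, pad_token_id, shifted)

# Mirrors the expectations in test_shift_tokens_right below (pad=1, start=2):
ids = np.array([[71, 82, 18, 33, 2, 1, 1]], dtype=np.int64)
shifted = shift_tokens_right_sketch(ids, 1, 2)
assert shifted.shape == ids.shape and (shifted[:, 0] == 2).all()
```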
+ +import unittest + +import numpy as np +import timeout_decorator # noqa + +from transformers import BartConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_generation_flax_utils import FlaxGenerationTesterMixin +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor + + +if is_flax_available(): + import os + + # The slow tests are often failing with OOM error on GPU + # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed + # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html + os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform" + + import jax + import jax.numpy as jnp + from transformers.models.bart.modeling_flax_bart import ( + FlaxBartForConditionalGeneration, + FlaxBartForQuestionAnswering, + FlaxBartForSequenceClassification, + FlaxBartModel, + shift_tokens_right, + ) + + +def prepare_bart_inputs_dict( + config, + input_ids, + decoder_input_ids=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = np.where(input_ids != config.pad_token_id, 1, 0) + if decoder_attention_mask is None: + decoder_attention_mask = np.where(decoder_input_ids != config.pad_token_id, 1, 0) + if head_mask is None: + head_mask = np.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads)) + if cross_attn_head_mask is None: + cross_attn_head_mask = np.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + } + + +class FlaxBartModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=32, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.initializer_range = initializer_range + + def prepare_config_and_inputs(self): + input_ids = np.clip(ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size), 3, self.vocab_size) + input_ids = np.concatenate((input_ids, 2 * np.ones((self.batch_size, 1), dtype=np.int64)), -1) + + decoder_input_ids = shift_tokens_right(input_ids, 1, 2) + + config = BartConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + 
encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + initializer_range=self.initializer_range, + use_cache=False, + ) + inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, inputs_dict): + max_decoder_length = 20 + model = model_class_name(config) + + encoder_outputs = model.encode(inputs_dict["input_ids"]) + + decoder_input_ids, decoder_attention_mask = ( + inputs_dict["decoder_input_ids"], + inputs_dict["decoder_attention_mask"], + ) + + past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs) + decoder_attention_mask = jnp.ones((decoder_input_ids.shape[0], max_decoder_length), dtype="i4") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :], + (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1), + ) + outputs_cache = model.decode( + decoder_input_ids[:, :-1], + encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + decoder_position_ids=decoder_position_ids, + ) + + decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model.decode( + decoder_input_ids[:, -1:], + encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=outputs_cache.past_key_values, + decoder_position_ids=decoder_position_ids, + ) + + outputs = model.decode(decoder_input_ids, encoder_outputs) + + diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, inputs_dict): + max_decoder_length = 20 + model = model_class_name(config) + + encoder_outputs = model.encode(inputs_dict["input_ids"]) + + decoder_input_ids, decoder_attention_mask = ( + inputs_dict["decoder_input_ids"], + inputs_dict["decoder_attention_mask"], + ) + + decoder_attention_mask_cache = jnp.concatenate( + [ + decoder_attention_mask, + jnp.zeros((decoder_attention_mask.shape[0], max_decoder_length - decoder_attention_mask.shape[1])), + ], + axis=-1, + ) + + past_key_values = model.init_cache(decoder_input_ids.shape[0], max_decoder_length, encoder_outputs) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(decoder_input_ids.shape[-1] - 1)[None, :], + (decoder_input_ids.shape[0], decoder_input_ids.shape[-1] - 1), + ) + + outputs_cache = model.decode( + decoder_input_ids[:, :-1], + encoder_outputs, + decoder_attention_mask=decoder_attention_mask_cache, + past_key_values=past_key_values, + decoder_position_ids=decoder_position_ids, + ) + decoder_position_ids = jnp.array(decoder_input_ids.shape[0] * [[decoder_input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model.decode( + decoder_input_ids[:, -1:], + encoder_outputs, + past_key_values=outputs_cache.past_key_values, + 
decoder_attention_mask=decoder_attention_mask_cache, + decoder_position_ids=decoder_position_ids, + ) + + outputs = model.decode(decoder_input_ids, encoder_outputs, decoder_attention_mask=decoder_attention_mask) + + diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class BartHeadTests(unittest.TestCase): + vocab_size = 99 + + def _get_config_and_data(self): + input_ids = np.array( + [ + [71, 82, 18, 33, 46, 91, 2], + [68, 34, 26, 58, 30, 82, 2], + [5, 97, 17, 39, 94, 40, 2], + [76, 83, 94, 25, 70, 78, 2], + [87, 59, 41, 35, 48, 66, 2], + [55, 13, 16, 58, 5, 2, 1], # note padding + [64, 27, 31, 51, 12, 75, 2], + [52, 64, 86, 17, 83, 39, 2], + [48, 61, 9, 24, 71, 82, 2], + [26, 1, 60, 48, 22, 13, 2], + [21, 5, 62, 28, 14, 76, 2], + [45, 98, 37, 86, 59, 48, 2], + [70, 70, 50, 9, 28, 0, 2], + ], + dtype=np.int64, + ) + + batch_size = input_ids.shape[0] + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + return config, input_ids, batch_size + + def test_sequence_classification_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + model = FlaxBartForSequenceClassification(config) + outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) + expected_shape = (batch_size, config.num_labels) + self.assertEqual(outputs["logits"].shape, expected_shape) + + def test_question_answering_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + model = FlaxBartForQuestionAnswering(config) + outputs = model(input_ids=input_ids) + + self.assertEqual(outputs["start_logits"].shape, input_ids.shape) + self.assertEqual(outputs["end_logits"].shape, input_ids.shape) + + # @timeout_decorator.timeout(1) # not working with the decorator so far + def test_lm_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + lm_model = FlaxBartForConditionalGeneration(config) + outputs = lm_model(input_ids=input_ids) + expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + + def test_lm_uneven_forward(self): + config = BartConfig( + vocab_size=self.vocab_size, + d_model=14, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=8, + decoder_ffn_dim=8, + max_position_embeddings=48, + ) + lm_model = FlaxBartForConditionalGeneration(config) + context = np.array([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=np.int64) + summary = np.array([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=np.int64) + outputs = lm_model(input_ids=context, decoder_input_ids=summary) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + + def test_shift_tokens_right(self): + input_ids = np.array([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=np.int64) + shifted = shift_tokens_right(input_ids, 1, 2) + n_pad_before = np.equal(input_ids, 1).astype(np.float32).sum() + n_pad_after = np.equal(shifted, 1).astype(np.float32).sum() + self.assertEqual(shifted.shape, input_ids.shape) + self.assertEqual(n_pad_after, n_pad_before - 1) + self.assertTrue(np.equal(shifted[:, 0], 2).all()) + + +@require_flax 
+class FlaxBartModelTest(FlaxModelTesterMixin, unittest.TestCase, FlaxGenerationTesterMixin): + is_encoder_decoder = True + all_model_classes = ( + ( + FlaxBartModel, + FlaxBartForConditionalGeneration, + FlaxBartForSequenceClassification, + FlaxBartForQuestionAnswering, + ) + if is_flax_available() + else () + ) + all_generative_model_classes = (FlaxBartForConditionalGeneration,) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxBartModelTester(self) + + def test_use_cache_forward(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + self.model_tester.check_use_cache_forward(model_class, config, inputs_dict) + + def test_use_cache_forward_with_attn_mask(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + self.model_tester.check_use_cache_forward_with_attn_mask(model_class, config, inputs_dict) + + def test_encode(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def encode_jitted(input_ids, attention_mask=None, **kwargs): + return model.encode(input_ids=input_ids, attention_mask=attention_mask) + + with self.subTest("JIT Enabled"): + jitted_outputs = encode_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = encode_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + def test_decode(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + model = model_class(config) + encoder_outputs = model.encode(inputs_dict["input_ids"], inputs_dict["attention_mask"]) + + prepared_inputs_dict = { + "decoder_input_ids": inputs_dict["decoder_input_ids"], + "decoder_attention_mask": inputs_dict["decoder_attention_mask"], + "encoder_outputs": encoder_outputs, + } + + @jax.jit + def decode_jitted(decoder_input_ids, decoder_attention_mask, encoder_outputs): + return model.decode( + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + ) + + with self.subTest("JIT Enabled"): + jitted_outputs = decode_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = decode_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("facebook/bart-base", from_pt=True) + # FlaxBartForSequenceClassification expects eos token in input_ids + input_ids = np.ones((1, 1)) * model.config.eos_token_id + outputs = model(input_ids) + self.assertIsNotNone(outputs) diff --git a/test_modeling_flax_bert.py b/test_modeling_flax_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..273f55d157d241acc224e44c5632822deac5c39e --- /dev/null 
+++ b/test_modeling_flax_bert.py @@ -0,0 +1,145 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from transformers import BertConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + from transformers.models.bert.modeling_flax_bert import ( + FlaxBertForMaskedLM, + FlaxBertForMultipleChoice, + FlaxBertForNextSentencePrediction, + FlaxBertForPreTraining, + FlaxBertForQuestionAnswering, + FlaxBertForSequenceClassification, + FlaxBertForTokenClassification, + FlaxBertModel, + ) + + +class FlaxBertModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return 
config, input_ids, token_type_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + FlaxBertModel, + FlaxBertForPreTraining, + FlaxBertForMaskedLM, + FlaxBertForMultipleChoice, + FlaxBertForQuestionAnswering, + FlaxBertForNextSentencePrediction, + FlaxBertForSequenceClassification, + FlaxBertForTokenClassification, + ) + if is_flax_available() + else () + ) + + def setUp(self): + self.model_tester = FlaxBertModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("bert-base-cased", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/test_modeling_flax_big_bird.py b/test_modeling_flax_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..d95a2df278d6cfff2646c188cec5e8ae7ee3dc50 --- /dev/null +++ b/test_modeling_flax_big_bird.py @@ -0,0 +1,164 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
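The `test_model_from_pretrained` smoke tests above and in the files that follow (BigBird, CLIP, BART) all use the same recipe: convert a PyTorch checkpoint on the fly with `from_pt=True` and feed a trivial one-token batch, asserting only that the forward pass runs. Pulled out on its own, the pattern looks like this (a sketch, assuming the PyTorch weights for the checkpoint are downloadable):

```python
import numpy as np
from transformers import FlaxBertModel

# from_pt=True converts the PyTorch weights to Flax at load time.
model = FlaxBertModel.from_pretrained("bert-base-cased", from_pt=True)

# A (1, 1) all-ones batch is deliberately minimal: the test only proves
# that conversion succeeded and the forward pass executes.
outputs = model(np.ones((1, 1), dtype="i4"))
assert outputs is not None
```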
+ +import unittest + +import numpy as np + +from transformers import BigBirdConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + from transformers.models.big_bird.modeling_flax_big_bird import ( + FlaxBigBirdForMaskedLM, + FlaxBigBirdForMultipleChoice, + FlaxBigBirdForPreTraining, + FlaxBigBirdForQuestionAnswering, + FlaxBigBirdForSequenceClassification, + FlaxBigBirdForTokenClassification, + FlaxBigBirdModel, + ) + + +class FlaxBigBirdModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + seq_length=56, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + attention_type="block_sparse", + use_bias=True, + rescale_embeddings=False, + block_size=4, + num_random_blocks=3, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + self.rescale_embeddings = rescale_embeddings + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = BigBirdConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + attention_type=self.attention_type, + block_size=self.block_size, + num_random_blocks=self.num_random_blocks, + use_bias=self.use_bias, + rescale_embeddings=self.rescale_embeddings, + ) + + return config, input_ids, token_type_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, 
input_ids, token_type_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxBigBirdModelTest(FlaxModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + FlaxBigBirdModel, + FlaxBigBirdForPreTraining, + FlaxBigBirdForMaskedLM, + FlaxBigBirdForMultipleChoice, + FlaxBigBirdForQuestionAnswering, + FlaxBigBirdForSequenceClassification, + FlaxBigBirdForTokenClassification, + ) + if is_flax_available() + else () + ) + + test_attn_probs = False + + def setUp(self): + self.model_tester = FlaxBigBirdModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("google/bigbird-roberta-base", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + def test_attention_outputs(self): + if self.test_attn_probs: + super().test_attention_outputs() diff --git a/test_modeling_flax_clip.py b/test_modeling_flax_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..8a82b94ca9a9a633cdca7fc29837e6d31b36826d --- /dev/null +++ b/test_modeling_flax_clip.py @@ -0,0 +1,513 @@ +import inspect +import tempfile +import unittest + +import numpy as np + +import transformers +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow + +from .test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + from transformers.models.clip.modeling_flax_clip import FlaxCLIPModel, FlaxCLIPTextModel, FlaxCLIPVisionModel + +if is_torch_available(): + import torch + + +class FlaxCLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_flax +class FlaxCLIPVisionModelTest(FlaxModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (FlaxCLIPVisionModel,) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxCLIPVisionModelTester(self) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(pixel_values, **kwargs): + return model(pixel_values=pixel_values, **kwargs).to_tuple() + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict) + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + + # CLIP has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + for model_class 
in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) + + +class FlaxCLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + 
max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_flax +class FlaxCLIPTextModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxCLIPTextModel,) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxCLIPTextModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + +class FlaxCLIPModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = FlaxCLIPTextModelTester(parent) + self.vision_model_tester = FlaxCLIPVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64) + + return config, input_ids, attention_mask, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + } + return config, inputs_dict + + +@require_flax +class FlaxCLIPModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxCLIPModel,) if is_flax_available() else () + test_attention_outputs = False + + def setUp(self): + self.model_tester = FlaxCLIPModelTester(self) + + # hidden_states are tested in individual model tests + def test_hidden_states_output(self): + pass + + @slow + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_ids, pixel_values, **kwargs): + return model(input_ids=input_ids, pixel_values=pixel_values, **kwargs).to_tuple() + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict) + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs[:4], outputs[:4]): + self.assertEqual(jitted_output.shape, output.shape) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_ids", "pixel_values", "attention_mask", "position_ids"] + 
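# only the first four, position-significant arguments are pinned; trailing kwargs are free to change +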
self.assertListEqual(arg_names[:4], expected_arg_names) + + def test_get_image_features(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = FlaxCLIPModel(config) + + @jax.jit + def model_jitted(pixel_values): + return model.get_image_features(pixel_values=pixel_values) + + with self.subTest("JIT Enabled"): + jitted_output = model_jitted(inputs_dict["pixel_values"]) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + output = model_jitted(inputs_dict["pixel_values"]) + + self.assertEqual(jitted_output.shape, output.shape) + self.assertTrue(np.allclose(jitted_output, output, atol=1e-3)) + + def test_get_text_features(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = FlaxCLIPModel(config) + + @jax.jit + def model_jitted(input_ids, attention_mask, **kwargs): + return model.get_text_features(input_ids=input_ids, attention_mask=attention_mask) + + with self.subTest("JIT Enabled"): + jitted_output = model_jitted(**inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + output = model_jitted(**inputs_dict) + + self.assertEqual(jitted_output.shape, output.shape) + self.assertTrue(np.allclose(jitted_output, output, atol=1e-3)) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(input_ids=np.ones((1, 1)), pixel_values=np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) + + # overwrite from common since FlaxCLIPModel returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + pt_model = pt_model_class(config).eval() + fx_model = model_class(config, dtype=jnp.float32) + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + # PyTorch CLIPModel returns loss, we skip it here as we don't return loss in JAX/Flax models + pt_outputs = pt_outputs[1:] + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + # overwrite from common since FlaxCLIPModel 
returns nested output + # which is not supported in the common test + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + pt_model = pt_model_class(config).eval() + fx_model = model_class(config, dtype=jnp.float32) + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + # PyTorch CLIPModel returns loss, we skip it here as we don't return loss in JAX/Flax models + pt_outputs = pt_outputs[1:] + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + pt_outputs_loaded = pt_outputs_loaded[1:] + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs[:4], pt_outputs_loaded[:4]): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) diff --git a/test_modeling_flax_common.py b/test_modeling_flax_common.py new file mode 100644 index 0000000000000000000000000000000000000000..10cc1f453802f06b6f4104e7b1d1a146dcecad23 --- /dev/null +++ b/test_modeling_flax_common.py @@ -0,0 +1,463 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
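+ +# FlaxModelTesterMixin below is the shared harness for the model-specific test files added in this patch. +# A consumer typically combines it with unittest.TestCase, roughly as follows (an illustrative sketch only; +# FlaxMyModel and FlaxMyModelTester are placeholder names, the real pattern appears in the Electra, GPT2, +# RoBERTa and ViT test files): +# +# @require_flax +# class FlaxMyModelTest(FlaxModelTesterMixin, unittest.TestCase): +# all_model_classes = (FlaxMyModel,) if is_flax_available() else () +# +# def setUp(self): +# self.model_tester = FlaxMyModelTester(self)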
+ +import copy +import inspect +import random +import tempfile +from typing import List, Tuple + +import numpy as np + +import transformers +from transformers import is_flax_available, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow + + +if is_flax_available(): + import os + + import jax + import jax.numpy as jnp + import jaxlib.xla_extension as jax_xla + from transformers import FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + + os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.12" # assumed parallelism: 8 + +if is_torch_available(): + import torch + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key or "initializer_factor" in key: + setattr(configs_no_init, key, 1e-10) + return configs_no_init + + +def ids_tensor(shape, vocab_size, rng=None): + """Creates a random int32 tensor of the given shape, with values drawn uniformly from [0, vocab_size).""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + output = np.array(values, dtype=jnp.int32).reshape(shape) + + return output + + +def floats_tensor(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor of the given shape, with values in [0, scale).""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.random() * scale) + + return np.array(values, dtype=jnp.float32).reshape(shape) + + +def random_attention_mask(shape, rng=None): + attn_mask = ids_tensor(shape, vocab_size=2, rng=rng) + # make sure that at least one token is attended to for each batch + attn_mask[:, -1] = 1 + return attn_mask + + +@require_flax +class FlaxModelTesterMixin: + model_tester = None + all_model_classes = () + is_encoder_decoder = False + + def _prepare_for_class(self, inputs_dict, model_class): + inputs_dict = copy.deepcopy(inputs_dict) + + # hack for now until we have AutoModel classes + if "ForMultipleChoice" in model_class.__name__: + inputs_dict = { + k: jnp.broadcast_to(v[:, None], (v.shape[0], self.model_tester.num_choices, v.shape[-1])) + if isinstance(v, (jax_xla.DeviceArray, np.ndarray)) + else v + for k, v in inputs_dict.items() + } + + return inputs_dict + + def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): + diff = np.abs((a - b)).max() + self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assert_almost_equals(
set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), 1e-5 + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + pt_model = pt_model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. + pt_model.config.use_cache = False + fx_model = model_class(config, dtype=jnp.float32) + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): + self.assert_almost_equals(fx_output_loaded, pt_output.numpy(), 4e-2) + + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + pt_model = pt_model_class(config).eval() + # Flax models don't use the `use_cache` option and cache is not returned as a default. + # So we disable `use_cache` here for PyTorch model. 
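+ # (with use_cache=True the PyTorch output tuple would also contain past_key_values, so the two to_tuple() results could no longer be compared element by element)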
+ pt_model.config.use_cache = False + fx_model = model_class(config, dtype=jnp.float32) + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded): + self.assert_almost_equals(fx_output, pt_output.numpy(), 4e-2) + + def test_from_pretrained_save_pretrained(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if model_class.__name__ != "FlaxBertModel": + continue + + with self.subTest(model_class.__name__): + model = model_class(config) + + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + outputs = model(**prepared_inputs_dict).to_tuple() + + # verify that normal save_pretrained works as expected + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_loaded = model_class.from_pretrained(tmpdirname) + + outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple() + for output_loaded, output in zip(outputs_loaded, outputs): + self.assert_almost_equals(output_loaded, output, 1e-3) + + # verify that save_pretrained for distributed training + # with `params=params` works as expected + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, params=model.params) + model_loaded = model_class.from_pretrained(tmpdirname) + + outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple() + for output_loaded, output in zip(outputs_loaded, outputs): + self.assert_almost_equals(output_loaded, output, 1e-3) + + @slow + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_ids, attention_mask=None, **kwargs): + return model(input_ids=input_ids, attention_mask=attention_mask, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + + self.assertEqual(jitted_output.shape, output.shape) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so 
arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = [ + "input_ids", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["input_ids", "attention_mask"] + self.assertListEqual(arg_names[:2], expected_arg_names) + + def test_naming_convention(self): + for model_class in self.all_model_classes: + model_class_name = model_class.__name__ + module_class_name = ( + model_class_name[:-5] + "Module" if model_class_name[-5:] == "Model" else model_class_name + "Module" + ) + bert_modeling_flax_module = __import__(model_class.__module__, fromlist=[module_class_name]) + module_cls = getattr(bert_modeling_flax_module, module_class_name) + + self.assertIsNotNone(module_cls) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_length = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that 
output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # Question Answering model returns start_logits and end_logits + if model_class in get_values(FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) diff --git a/test_modeling_flax_electra.py b/test_modeling_flax_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..2e15f94402bb1601639451f9e1bd75dc489b23e4 --- /dev/null +++ b/test_modeling_flax_electra.py @@ -0,0 +1,133 @@ +import unittest + +import numpy as np + +from transformers import ElectraConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + from transformers.models.electra.modeling_flax_electra import ( + FlaxElectraForMaskedLM, + FlaxElectraForMultipleChoice, + FlaxElectraForPreTraining, + FlaxElectraForQuestionAnswering, + FlaxElectraForSequenceClassification, + FlaxElectraForTokenClassification, + FlaxElectraModel, + ) + + +class FlaxElectraModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=24, + 
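# note: embedding_size is kept different from hidden_size, which also exercises Electra's embedding-to-hidden projection in these tests +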
hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + embedding_size=self.embedding_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + FlaxElectraModel, + FlaxElectraForMaskedLM, + FlaxElectraForPreTraining, + FlaxElectraForTokenClassification, + FlaxElectraForQuestionAnswering, + FlaxElectraForMultipleChoice, + FlaxElectraForSequenceClassification, + ) + if is_flax_available() + else () + ) + + def setUp(self): + self.model_tester = FlaxElectraModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + if model_class_name == FlaxElectraForMaskedLM: + model = model_class_name.from_pretrained("google/electra-small-generator") + else: + model = model_class_name.from_pretrained("google/electra-small-discriminator") + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/test_modeling_flax_gpt2.py b/test_modeling_flax_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..c79fc5ef352bd94bd1f5bca1d0d91ccd6f50857a --- /dev/null +++ 
b/test_modeling_flax_gpt2.py @@ -0,0 +1,325 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import tempfile +import unittest + +import numpy as np + +import transformers +from transformers import GPT2Config, GPT2Tokenizer, is_flax_available, is_torch_available +from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow + +from .test_generation_flax_utils import FlaxGenerationTesterMixin +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( + convert_pytorch_state_dict_to_flax, + load_flax_weights_in_pytorch_model, + ) + from transformers.models.gpt2.modeling_flax_gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model + +if is_torch_available(): + import torch + + +class FlaxGPT2ModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self, gradient_checkpointing=False): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + use_cache=False, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + ) + + return (config, input_ids, input_mask) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = 
config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5]))) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class FlaxGPT2ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): + + all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else () + all_generative_model_classes = (FlaxGPT2LMHeadModel,) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxGPT2ModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + model_class_name, config, input_ids, attention_mask + ) + + @slow + def test_batch_generation(self): + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token="</s>", padding_side="left") + inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="jax", padding=True, truncation=True) + + model =
FlaxGPT2LMHeadModel.from_pretrained("gpt2") + model.do_sample = False + model.config.pad_token_id = model.config.eos_token_id + + jit_generate = jax.jit(model.generate) + + output_sequences = jit_generate(inputs["input_ids"], attention_mask=inputs["attention_mask"]).sequences + + output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) + + expected_string = [ + "Hello this is a long string of words. I'm going to try to explain what I mean.", + "Hey, I'm not sure if I'm going to be able to do", + ] + + self.assertListEqual(output_string, expected_string) + + # overwrite from common since `attention_mask` in combination + # with `causal_mask` behaves slightly differently + @is_pt_flax_cross_test + def test_equivalence_pt_to_flax(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers, pt_model_class_name) + + batch_size, seq_length = pt_inputs["input_ids"].shape + rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + pt_inputs["attention_mask"][batch_idx, :start_index] = 0 + pt_inputs["attention_mask"][batch_idx, start_index:] = 1 + prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0 + prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1 + pt_model = pt_model_class(config).eval() + fx_model = model_class(config, dtype=jnp.float32) + + fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model) + fx_model.params = fx_state + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + pt_model.save_pretrained(tmpdirname) + fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True) + + fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple() + self.assertEqual( + len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch" + ) + for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs): + self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2) + + # overwrite from common since `attention_mask` in combination + # with `causal_mask` behaves slightly differently + @is_pt_flax_cross_test + def test_equivalence_flax_to_pt(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + # prepare inputs + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()} + + # load corresponding PyTorch class + pt_model_class_name = model_class.__name__[4:] # Skip the "Flax" at the beginning + pt_model_class = getattr(transformers,
pt_model_class_name) + + pt_model = pt_model_class(config).eval() + fx_model = model_class(config, dtype=jnp.float32) + + pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params) + batch_size, seq_length = pt_inputs["input_ids"].shape + rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + pt_inputs["attention_mask"][batch_idx, :start_index] = 0 + pt_inputs["attention_mask"][batch_idx, start_index:] = 1 + prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0 + prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1 + + # make sure weights are tied in PyTorch + pt_model.tie_weights() + + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs).to_tuple() + + fx_outputs = fx_model(**prepared_inputs_dict).to_tuple() + self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") + for fx_output, pt_output in zip(fx_outputs, pt_outputs): + self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2) + + with tempfile.TemporaryDirectory() as tmpdirname: + fx_model.save_pretrained(tmpdirname) + pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True) + + with torch.no_grad(): + pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple() + + self.assertEqual( + len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch" + ) + for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded): + self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("gpt2", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/test_modeling_flax_roberta.py b/test_modeling_flax_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..8671a39e1e7b4d20747021f2aa78fde7f2bf705a --- /dev/null +++ b/test_modeling_flax_roberta.py @@ -0,0 +1,140 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers import RobertaConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + from transformers.models.roberta.modeling_flax_roberta import ( + FlaxRobertaForMaskedLM, + FlaxRobertaForMultipleChoice, + FlaxRobertaForQuestionAnswering, + FlaxRobertaForSequenceClassification, + FlaxRobertaForTokenClassification, + FlaxRobertaModel, + ) + + +class FlaxRobertaModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = RobertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxRobertaModelTest(FlaxModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + FlaxRobertaModel, + FlaxRobertaForMaskedLM, + FlaxRobertaForSequenceClassification, + FlaxRobertaForTokenClassification, + FlaxRobertaForMultipleChoice, + FlaxRobertaForQuestionAnswering, + ) + if 
is_flax_available() + else () + ) + + def setUp(self): + self.model_tester = FlaxRobertaModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("roberta-base", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/test_modeling_flax_vit.py b/test_modeling_flax_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..f745d7c7ffa63d00938ca01b5141bd3eae2a4609 --- /dev/null +++ b/test_modeling_flax_vit.py @@ -0,0 +1,241 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import unittest + +import numpy as np + +from transformers import ViTConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor + + +if is_flax_available(): + + import jax + from transformers.models.vit.modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel + + +class FlaxViTModelTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + config = ViTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, pixel_values + + def create_and_check_model(self, config, pixel_values, labels): + + model = FlaxViTModel(config=config) + result = model(pixel_values) + # expected sequence 
length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_flax +class FlaxViTModelTest(FlaxModelTesterMixin, unittest.TestCase): + + all_model_classes = (FlaxViTModel, FlaxViTForImageClassification) if is_flax_available() else () + + def setUp(self) -> None: + self.model_tester = FlaxViTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + # We need to override this test because in ViT, the seq_len equals the number of patches + 1 + # we compute that here + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + num_patches = (config.image_size // config.patch_size) ** 2 + seq_length = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + # We need to override this test because ViT's forward signature is different from that of text models.
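+ # (FlaxViT consumes pixel_values only, so just the first argument of __call__ is checked below)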
+ def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # We need to override this test because ViT expects pixel_values instead of input_ids + @slow + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(pixel_values, **kwargs): + return model(pixel_values=pixel_values, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + # We need to override this test because in ViT, the seq_len equals the number of patches + 1 + # we compute that here + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + num_patches = (config.image_size // config.patch_size) ** 2 + seq_length = num_patches + 1 # we add 1 for the [CLS] token + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("google/vit-base-patch16-224") + outputs = model(np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) diff --git a/test_modeling_fsmt.py b/test_modeling_fsmt.py new file mode 100644 index 0000000000000000000000000000000000000000..7c3ba4a1e80e163963ad3f476a0e258c6a85f230 --- /dev/null +++ b/test_modeling_fsmt.py @@ -0,0 +1,537 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest + +import timeout_decorator # noqa + +from parameterized import parameterized +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer + from transformers.models.fsmt.modeling_fsmt import ( + SinusoidalPositionalEmbedding, + _prepare_fsmt_decoder_inputs, + invert_mask, + shift_tokens_right, + ) + from transformers.pipelines import TranslationPipeline + + +@require_torch +class ModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.src_vocab_size = 99 + self.tgt_vocab_size = 99 + self.langs = ["ru", "en"] + self.batch_size = 13 + self.seq_length = 7 + self.is_training = False + self.use_labels = False + self.hidden_size = 16 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 4 + self.hidden_act = "relu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 20 + self.bos_token_id = 0 + self.pad_token_id = 1 + self.eos_token_id = 2 + torch.manual_seed(0) + + # hack needed for modeling_common tests - despite not really having this attribute in this model + self.vocab_size = self.src_vocab_size + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.src_vocab_size).clamp( + 3, + ) + input_ids[:, -1] = 2 # Eos Token + + config = FSMTConfig( + vocab_size=self.src_vocab_size, # hack needed for common tests + src_vocab_size=self.src_vocab_size, + tgt_vocab_size=self.tgt_vocab_size, + langs=self.langs, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_fsmt_inputs_dict(config, input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"] + inputs_dict["decoder_attention_mask"] = inputs_dict["attention_mask"] + inputs_dict["use_cache"] = False + return config, inputs_dict + + +def prepare_fsmt_inputs_dict( + config, + input_ids, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, 
device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + } + + +@require_torch +class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (FSMTModel, FSMTForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (FSMTForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = ModelTester(self) + self.langs = ["en", "ru"] + config = { + "langs": self.langs, + "src_vocab_size": 10, + "tgt_vocab_size": 20, + } + # XXX: hack to appease to all other models requiring `vocab_size` + config["vocab_size"] = 99 # no such thing in FSMT + self.config_tester = ConfigTester(self, config_class=FSMTConfig, **config) + + def test_config(self): + self.config_tester.run_common_tests() + + # XXX: override test_model_common_attributes / different Embedding type + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding)) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.modules.sparse.Embedding)) + + def test_initialization_more(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + model = FSMTModel(config) + model.to(torch_device) + model.eval() + # test init + # self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) + + def _check_var(module): + """Check that we initialized various parameters from N(0, config.init_std).""" + self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2) + + _check_var(model.encoder.embed_tokens) + _check_var(model.encoder.layers[0].self_attn.k_proj) + _check_var(model.encoder.layers[0].fc1) + # XXX: different std for fairseq version of SinusoidalPositionalEmbedding + # self.assertAlmostEqual(torch.std(model.encoder.embed_positions.weights).item(), config.init_std, 2) + + def test_advanced_inputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + config.use_cache = False + inputs_dict["input_ids"][:, -2:] = config.pad_token_id + decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( + config, inputs_dict["input_ids"] + ) + model = FSMTModel(config).to(torch_device).eval() + + decoder_features_with_created_mask = model(**inputs_dict)[0] + decoder_features_with_passed_mask = model( + decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict + )[0] + _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) + useless_mask = torch.zeros_like(decoder_attn_mask) + decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] + self.assertTrue(isinstance(decoder_features, torch.Tensor)) # no hidden states or attentions + self.assertEqual( + decoder_features.size(), + (self.model_tester.batch_size, self.model_tester.seq_length, config.tgt_vocab_size), + ) + if decoder_attn_mask.min().item() < -1e3: # some tokens were masked + 
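# editorial note: the decoder mask is additive - entries near -inf zero out that
+            # position's softmax weight (roughly torch.softmax(scores + mask, dim=-1)) - so the
+            # created mask, which blocks the pad positions, must produce different features
+            # than the all-zeros "useless" mask used above.
+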
self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item()) + + # Test different encoder attention masks + decoder_features_with_long_encoder_mask = model( + inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long() + )[0] + _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) + + def test_save_load_missing_keys(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + @unittest.skip("Test has a segmentation fault on torch 1.8.0") + def test_export_to_onnx(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + model = FSMTModel(config).to(torch_device) + with tempfile.TemporaryDirectory() as tmpdirname: + torch.onnx.export( + model, + (inputs_dict["input_ids"], inputs_dict["attention_mask"]), + f"{tmpdirname}/fsmt_test.onnx", + export_params=True, + opset_version=12, + input_names=["input_ids", "attention_mask"], + ) + + @unittest.skip("can't be implemented for FSMT due to dual vocab.") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip("Passing inputs_embeds not implemented for FSMT.") + def test_inputs_embeds(self): + pass + + @unittest.skip("model weights aren't tied in FSMT.") + def test_tie_model_weights(self): + pass + + @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") + def test_resize_embeddings_untied(self): + pass + + +@require_torch +class FSMTHeadTests(unittest.TestCase): + src_vocab_size = 99 + tgt_vocab_size = 99 + langs = ["ru", "en"] + + def _get_config(self): + return FSMTConfig( + src_vocab_size=self.src_vocab_size, + tgt_vocab_size=self.tgt_vocab_size, + langs=self.langs, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + + def _get_config_and_data(self): + input_ids = torch.tensor( + [ + [71, 82, 18, 33, 46, 91, 2], + [68, 34, 26, 58, 30, 82, 2], + [5, 97, 17, 39, 94, 40, 2], + [76, 83, 94, 25, 70, 78, 2], + [87, 59, 41, 35, 48, 66, 2], + [55, 13, 16, 58, 5, 2, 1], # note padding + [64, 27, 31, 51, 12, 75, 2], + [52, 64, 86, 17, 83, 39, 2], + [48, 61, 9, 24, 71, 82, 2], + [26, 1, 60, 48, 22, 13, 2], + [21, 5, 62, 28, 14, 76, 2], + [45, 98, 37, 86, 59, 48, 2], + [70, 70, 50, 9, 28, 0, 2], + ], + dtype=torch.long, + device=torch_device, + ) + + batch_size = input_ids.shape[0] + config = self._get_config() + return config, input_ids, batch_size + + def test_generate_beam_search(self): + input_ids = torch.tensor([[71, 82, 2], [68, 34, 2]], dtype=torch.long, device=torch_device) + config = self._get_config() + lm_model = FSMTForConditionalGeneration(config).to(torch_device) + lm_model.eval() + + max_length = 5 + new_input_ids = lm_model.generate( + input_ids.clone(), + do_sample=True, + num_return_sequences=1, + num_beams=2, + no_repeat_ngram_size=3, + max_length=max_length, + ) + self.assertEqual(new_input_ids.shape, (input_ids.shape[0], max_length)) + + def test_shift_tokens_right(self): + input_ids = torch.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=torch.long) + shifted = 
shift_tokens_right(input_ids, 1)
+        n_pad_before = input_ids.eq(1).float().sum()
+        n_pad_after = shifted.eq(1).float().sum()
+        self.assertEqual(shifted.shape, input_ids.shape)
+        self.assertEqual(n_pad_after, n_pad_before - 1)
+        self.assertTrue(torch.eq(shifted[:, 0], 2).all())
+
+    def test_generate_fp16(self):
+        config, input_ids, batch_size = self._get_config_and_data()
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = FSMTForConditionalGeneration(config).eval().to(torch_device)
+        if torch_device == "cuda":
+            model.half()
+        model.generate(input_ids, attention_mask=attention_mask)
+        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
+
+    def test_dummy_inputs(self):
+        config, *_ = self._get_config_and_data()
+        model = FSMTForConditionalGeneration(config).eval().to(torch_device)
+        model(**model.dummy_inputs)
+
+    def test_prepare_fsmt_decoder_inputs(self):
+        config, *_ = self._get_config_and_data()
+        input_ids = _long_tensor([4, 4, 2])
+        decoder_input_ids = _long_tensor([[26388, 2, config.pad_token_id]])
+        ignore = float("-inf")
+        decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs(
+            config, input_ids, decoder_input_ids
+        )
+        expected_causal_mask = torch.tensor(
+            [[0, ignore, ignore], [0, 0, ignore], [0, 0, 0]]  # never attend to the final token, because it's pad
+        ).to(input_ids.device)
+        self.assertEqual(decoder_attn_mask.size(), decoder_input_ids.size())
+        self.assertTrue(torch.eq(expected_causal_mask, causal_mask).all())
+
+
+def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
+    """If tensors aren't close, or a and b aren't both tensors, raise a helpful AssertionError."""
+    if a is None and b is None:
+        return True
+    try:
+        if torch.allclose(a, b, atol=atol):
+            return True
+        raise Exception()
+    except Exception:
+        if len(prefix) > 0:
+            prefix = f"{prefix}: "
+        raise AssertionError(f"{prefix}{a} != {b}")
+
+
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)
+
+
+TOLERANCE = 1e-4
+
+
+pairs = [
+    ["en-ru"],
+    ["ru-en"],
+    ["en-de"],
+    ["de-en"],
+]
+
+
+@require_torch
+@require_sentencepiece
+@require_tokenizers
+class FSMTModelIntegrationTests(unittest.TestCase):
+    tokenizers_cache = {}
+    models_cache = {}
+    default_mname = "facebook/wmt19-en-ru"
+
+    @cached_property
+    def default_tokenizer(self):
+        return self.get_tokenizer(self.default_mname)
+
+    @cached_property
+    def default_model(self):
+        return self.get_model(self.default_mname)
+
+    def get_tokenizer(self, mname):
+        if mname not in self.tokenizers_cache:
+            self.tokenizers_cache[mname] = FSMTTokenizer.from_pretrained(mname)
+        return self.tokenizers_cache[mname]
+
+    def get_model(self, mname):
+        if mname not in self.models_cache:
+            self.models_cache[mname] = FSMTForConditionalGeneration.from_pretrained(mname).to(torch_device)
+            if torch_device == "cuda":
+                self.models_cache[mname].half()
+        return self.models_cache[mname]
+
+    @slow
+    def test_inference_no_head(self):
+        tokenizer = self.default_tokenizer
+        model = FSMTModel.from_pretrained(self.default_mname).to(torch_device)
+
+        src_text = "My friend computer will translate this for me"
+        input_ids = tokenizer([src_text], return_tensors="pt")["input_ids"]
+        input_ids = _long_tensor(input_ids).to(torch_device)
+        inputs_dict = prepare_fsmt_inputs_dict(model.config, input_ids)
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+        expected_shape = torch.Size((1, 10, model.config.tgt_vocab_size))
+        self.assertEqual(output.shape, expected_shape)
+        # expected numbers were generated with the en-ru model, using just fairseq's model4.pt
+        # may have to adjust if switched to a different checkpoint
+        expected_slice = torch.tensor(
+            [[-1.5753, -1.5753, 2.8975], [-0.9540, -0.9540, 1.0299], [-3.3131, -3.3131, 0.5219]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def translation_setup(self, pair):
+        text = {
+            "en": "Machine learning is great, isn't it?",
+            "ru": "Машинное обучение - это здорово, не так ли?",
+            "de": "Maschinelles Lernen ist großartig, oder?",
+        }
+
+        src, tgt = pair.split("-")
+        print(f"Testing {src} -> {tgt}")
+        mname = f"facebook/wmt19-{pair}"
+
+        src_text = text[src]
+        tgt_text = text[tgt]
+
+        tokenizer = self.get_tokenizer(mname)
+        model = self.get_model(mname)
+        return tokenizer, model, src_text, tgt_text
+
+    @parameterized.expand(pairs)
+    @slow
+    def test_translation_direct(self, pair):
+        tokenizer, model, src_text, tgt_text = self.translation_setup(pair)
+
+        input_ids = tokenizer.encode(src_text, return_tensors="pt").to(torch_device)
+
+        outputs = model.generate(input_ids)
+        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        assert decoded == tgt_text, f"\n\ngot: {decoded}\nexp: {tgt_text}\n"
+
+    @parameterized.expand(pairs)
+    @slow
+    def test_translation_pipeline(self, pair):
+        tokenizer, model, src_text, tgt_text = self.translation_setup(pair)
+        device = 0 if torch_device == "cuda" else -1
+        pipeline = TranslationPipeline(model, tokenizer, framework="pt", device=device)
+        output = pipeline([src_text])
+        self.assertEqual([tgt_text], [x["translation_text"] for x in output])
+
+
+@require_torch
+class TestSinusoidalPositionalEmbeddings(unittest.TestCase):
+    padding_idx = 1
+    tolerance = 1e-4
+
+    def test_basic(self):
+        input_ids = torch.tensor([[4, 10]], dtype=torch.long, device=torch_device)
+        emb1 = SinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6, padding_idx=self.padding_idx).to(
+            torch_device
+        )
+        emb = emb1(input_ids)
+        desired_weights = torch.tensor(
+            [
+                [9.0930e-01, 1.9999e-02, 2.0000e-04, -4.1615e-01, 9.9980e-01, 1.0000e00],
+                [1.4112e-01, 2.9995e-02, 3.0000e-04, -9.8999e-01, 9.9955e-01, 1.0000e00],
+            ]
+        ).to(torch_device)
+        self.assertTrue(
+            torch.allclose(emb[0], desired_weights, atol=self.tolerance),
+            msg=f"\nexp:\n{desired_weights}\ngot:\n{emb[0]}\n",
+        )
+
+    def test_odd_embed_dim(self):
+        # odd embedding_dim is allowed
+        SinusoidalPositionalEmbedding(num_positions=4, embedding_dim=5, padding_idx=self.padding_idx).to(torch_device)
+
+        # odd num_embeddings is allowed
+        SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx).to(torch_device)
+
+    @unittest.skip("different from marian (needs more research)")
+    def test_positional_emb_weights_against_marian(self):
+        desired_weights = torch.tensor(
+            [
+                [0, 0, 0, 0, 0],
+                [0.84147096, 0.82177866, 0.80180490, 0.78165019, 0.76140374],
+                [0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258],
+            ]
+        )
+        emb1 = SinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512, padding_idx=self.padding_idx).to(
+            torch_device
+        )
+        weights = emb1.weights.data[:3, :5]
+        # XXX: only the 1st and 3rd lines match - this is testing against a
+        # verbatim copy of SinusoidalPositionalEmbedding from fairseq
+        self.assertTrue(
+            torch.allclose(weights, desired_weights, atol=self.tolerance),
+            msg=f"\nexp:\n{desired_weights}\ngot:\n{weights}\n",
+        )
+
+        # test that the forward pass is just a lookup - there is no ignore-padding logic
+        input_ids = torch.tensor(
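+            # editorial note: the trailing padding_idx tokens below are deliberate - the forward
+            # pass should embed them like any other position, i.e. a pure table lookup with no
+            # special handling of padding.
+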
[[4, 10, self.padding_idx, self.padding_idx, self.padding_idx]], dtype=torch.long, device=torch_device + ) + no_cache_pad_zero = emb1(input_ids)[0] + # XXX: only the 1st line matches the 3rd + self.assertTrue( + torch.allclose(torch.tensor(desired_weights, device=torch_device), no_cache_pad_zero[:3, :5], atol=1e-3) + ) diff --git a/test_modeling_funnel.py b/test_modeling_funnel.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f8f7bf0e59a9d27284c604950f82514e77fcd1 --- /dev/null +++ b/test_modeling_funnel.py @@ -0,0 +1,508 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import FunnelTokenizer, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + FunnelBaseModel, + FunnelConfig, + FunnelForMaskedLM, + FunnelForMultipleChoice, + FunnelForPreTraining, + FunnelForQuestionAnswering, + FunnelForSequenceClassification, + FunnelForTokenClassification, + FunnelModel, + ) + + +class FunnelModelTester: + """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester""" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + block_sizes=[1, 1, 2], + num_decoder_layers=1, + d_model=32, + n_head=4, + d_head=8, + d_inner=37, + hidden_act="gelu_new", + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + max_position_embeddings=512, + type_vocab_size=3, + num_labels=3, + num_choices=4, + scope=None, + base=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.block_sizes = block_sizes + self.num_decoder_layers = num_decoder_layers + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.d_inner = d_inner + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = 2 + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + # Used in the tests to check the size of the first attention layer + self.num_attention_heads = n_head + # Used in the tests to check the size of the first hidden state + self.hidden_size = self.d_model + # Used in the tests to check the number 
of output hidden states/attentions + self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers) + # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with + # the last hidden state of the first block (which is the first hidden state of the decoder). + if not base: + self.expected_num_hidden_layers = self.num_hidden_layers + 2 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) + + config = FunnelConfig( + vocab_size=self.vocab_size, + block_sizes=self.block_sizes, + num_decoder_layers=self.num_decoder_layers, + d_model=self.d_model, + n_head=self.n_head, + d_head=self.d_head, + d_inner=self.d_inner, + hidden_act=self.hidden_act, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + activation_dropout=self.activation_dropout, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + ) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + model.config.truncate_seq = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + model.config.separate_cls = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + def create_and_check_base_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelBaseModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + model.config.truncate_seq = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model)) + + model.config.separate_cls = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + 
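# editorial note: FunnelForPreTraining scores every token as real vs. replaced
+    # (ELECTRA-style pretraining), so its logits have shape (batch_size, seq_length) and the
+    # fake_token_labels prepared above are binary (ids_tensor with vocab_size=1 gives zeros).
+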
def create_and_check_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = FunnelForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = FunnelForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_choices = self.num_choices + model = FunnelForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = FunnelForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def 
prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FunnelModelTest(ModelTesterMixin, unittest.TestCase): + test_head_masking = False + test_pruning = False + all_model_classes = ( + ( + FunnelModel, + FunnelForMaskedLM, + FunnelForPreTraining, + FunnelForQuestionAnswering, + FunnelForTokenClassification, + ) + if is_torch_available() + else () + ) + test_sequence_classification_problem_types = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = FunnelModelTester(self) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + +@require_torch +class FunnelBaseModelTest(ModelTesterMixin, unittest.TestCase): + test_head_masking = False + test_pruning = False + all_model_classes = ( + (FunnelBaseModel, FunnelForMultipleChoice, FunnelForSequenceClassification) if is_torch_available() else () + ) + + def setUp(self): + self.model_tester = FunnelModelTester(self, base=True) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_base_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_base_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + # overwrite from test_modeling_common + def test_training(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ == "FunnelBaseModel": + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class FunnelModelIntegrationTest(unittest.TestCase): + def test_inference_tiny_model(self): + batch_size = 13 + sequence_length = 7 + input_ids = torch.arange(0, batch_size * sequence_length).long().reshape(batch_size, sequence_length) + lengths = [0, 1, 2, 3, 4, 5, 6, 4, 1, 3, 5, 0, 1] + token_type_ids = torch.tensor([[2] + [0] * a + [1] * (sequence_length - a - 1) for a in lengths]) + + model = FunnelModel.from_pretrained("sgugger/funnel-random-tiny") + output = model(input_ids, token_type_ids=token_type_ids)[0].abs() + + expected_output_sum = torch.tensor(2344.8352) + expected_output_mean = torch.tensor(0.8052) + self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=1e-4)) + self.assertTrue(torch.allclose(output.mean(), expected_output_mean, atol=1e-4)) + + attention_mask = torch.tensor([[1] * 7, [1] * 4 + [0] * 3] * 6 + [[0, 1, 1, 0, 0, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0].abs() + + expected_output_sum = torch.tensor(2343.8425) + expected_output_mean = torch.tensor(0.8049) + self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=1e-4)) + self.assertTrue(torch.allclose(output.mean(), expected_output_mean, atol=1e-4)) + + @slow + def test_inference_model(self): + tokenizer = FunnelTokenizer.from_pretrained("huggingface/funnel-small") + model = FunnelModel.from_pretrained("huggingface/funnel-small") + inputs = tokenizer("Hello! I am the Funnel Transformer model.", return_tensors="pt") + output = model(**inputs)[0] + + expected_output_sum = torch.tensor(235.7246) + expected_output_mean = torch.tensor(0.0256) + self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=1e-4)) + self.assertTrue(torch.allclose(output.mean(), expected_output_mean, atol=1e-4)) diff --git a/test_modeling_gpt2.py b/test_modeling_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..ff00231b4aaace782b42226cbdc451dd735a1528 --- /dev/null +++ b/test_modeling_gpt2.py @@ -0,0 +1,693 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import datetime +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, + GPT2Config, + GPT2DoubleHeadsModel, + GPT2ForSequenceClassification, + GPT2LMHeadModel, + GPT2Model, + GPT2Tokenizer, + ) + + +class GPT2ModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return GPT2Config.from_pretrained("gpt2") + + def prepare_config_and_inputs(self, gradient_checkpointing=False): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], 
self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + use_cache=not gradient_checkpointing, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + 
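# editorial note: rather than comparing full hidden states, the test samples a single
+        # random hidden dimension: the last position of the no-past run, output[:, -1, idx],
+        # must match the only position of the with-past run, output[:, 0, idx].
+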
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_gpt2_model_attention_mask_past(
+        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+    ):
+        model = GPT2Model(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # create attention mask
+        attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+        half_seq_length = self.seq_length // 2
+        attn_mask[:, half_seq_length:] = 0
+
+        # first forward pass
+        output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
+
+        # create hypothetical next token and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # change a random masked slice from input_ids
+        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+        random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+        # append to next input_ids and attn_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        attn_mask = torch.cat(
+            [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
+            dim=1,
+        )
+
+        # get two different outputs
+        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def create_and_check_gpt2_model_past_large_inputs(
+        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+    ):
+        model = GPT2Model(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True)
+
+        output, past = outputs.to_tuple()
+
+        # create hypothetical next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and token_type_ids
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
+        )["last_hidden_state"]
+        output_from_past = model(
+            next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past
+        )["last_hidden_state"]
+        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
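+        # editorial note: the cached forward pass only produced the 3 appended positions, so
+        # below it is aligned against the last 3 positions of the full-sequence output.
+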
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.to(torch_device) + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def create_and_check_double_lm_head_model( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): + model = GPT2DoubleHeadsModel(config) + model.to(torch_device) + model.eval() + + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + "labels": multiple_choice_inputs_ids, + } + + result = model(**inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_gpt2_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPT2ForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () + all_parallelizable_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if 
is_torch_available() else () + fx_ready_model_classes = all_model_classes + test_missing_keys = False + test_model_parallel = True + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GPT2DoubleHeadsModel": + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["input_ids"] = inputs_dict["labels"] + inputs_dict["token_type_ids"] = inputs_dict["labels"] + inputs_dict["mc_token_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["mc_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = GPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) + + def test_gpt2_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) + + def test_gpt2_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) + + def test_gpt2_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt2_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + def test_gpt2_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) + + def test_gpt2_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(gradient_checkpointing=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + @slow + def test_batch_generation(self): + model = GPT2LMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = 
model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_batch_generation_2heads(self): + model = GPT2DoubleHeadsModel.from_pretrained("gpt2") + model.to(torch_device) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + + tokenizer.padding_side = "left" + + # This tokenizer has no pad token, so we have to set it in some way + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. 
I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + for model_name in GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = GPT2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class GPT2ModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_gpt2(self): + for checkpointing in [True, False]: + model = GPT2LMHeadModel.from_pretrained("gpt2", gradient_checkpointing=checkpointing) + model.to(torch_device) + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog + expected_output_ids = [ + 464, + 3290, + 373, + 1043, + 287, + 257, + 2214, + 1474, + 262, + 16246, + 286, + 2688, + 290, + 2688, + 27262, + 13, + 198, + 198, + 464, + 3290, + ] # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_gpt2_sample(self): + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + model = GPT2LMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + token_type_ids = tokenized.token_type_ids.to(torch_device) + output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) + output_seq_tt = model.generate( + input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and if you don't know anything about the state of play during your holiday" + ) + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + self.assertTrue( + all([output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))]) + ) # token_type_ids should change output + + @slow + def test_gpt2_sample_max_time(self): + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + model = GPT2LMHeadModel.from_pretrained("gpt2") + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + + MAX_TIME = 0.5 + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + 
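# editorial note: max_time is a soft wall-clock cutoff checked between generation steps,
+        # so a run is expected to finish just past MAX_TIME but well under 1.5 * MAX_TIME;
+        # the same window is re-checked with beam search below, and the final run with
+        # max_time=None must run longer, proving the cutoff was actually doing the work.
+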
start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) + self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) + + start = datetime.datetime.now() + model.generate(input_ids, do_sample=False, max_time=None, max_length=256) + duration = datetime.datetime.now() - start + self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) diff --git a/test_modeling_gpt_neo.py b/test_modeling_gpt_neo.py new file mode 100644 index 0000000000000000000000000000000000000000..dab9f02c580c4f08e94e492da40768478ee1af9c --- /dev/null +++ b/test_modeling_gpt_neo.py @@ -0,0 +1,662 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch GPT Neo model. 
""" + + +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST, + GPT2Tokenizer, + GPTNeoConfig, + GPTNeoForCausalLM, + GPTNeoForSequenceClassification, + GPTNeoModel, + ) + from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoAttentionMixin + + +class GPTNeoModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=4, + attention_types=[[["global", "local"], 2]], + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + window_size=7, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.window_size = window_size + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + self.chunk_length = window_size + self.attention_types = attention_types + + def get_large_model_config(self): + return GPTNeoConfig.from_pretrained("gpt_neo") + + def prepare_config_and_inputs(self, gradient_checkpointing=False): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPTNeoConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_layers=self.num_hidden_layers, + 
num_heads=self.num_attention_heads, + max_position_embeddings=self.max_position_embeddings, + use_cache=not gradient_checkpointing, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + window_size=self.window_size, + attention_types=self.attention_types, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_gpt_neo_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + # past_key_values is not implemented + # self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt_neo_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, 
*args): + model = GPTNeoForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_gpt_neo_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_forward_and_backwards(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoForCausalLM(config) + model.to(torch_device) + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + (GPTNeoModel, GPTNeoForCausalLM, GPTNeoForSequenceClassification) if is_torch_available() else () + ) + all_generative_model_classes = (GPTNeoForCausalLM,) if is_torch_available() else () + fx_ready_model_classes = all_model_classes + test_missing_keys = False + test_pruning = False + test_model_parallel = False + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + return inputs_dict + + def setUp(self): + self.model_tester = GPTNeoModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTNeoConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt_neo_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model(*config_and_inputs) + + def test_gpt_neo_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model_past(*config_and_inputs) + + def test_gpt_neo_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt_neo_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs) + + def test_gpt_neo_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(gradient_checkpointing=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def 
_get_local_attn_seq_len_block_len_windows(self, seq_len, window_size): + block_length = window_size + while seq_len % block_length != 0: + block_length -= 1 + windows = seq_len // block_length + local_seq_len = window_size + block_length + return local_seq_len, block_length, windows + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # test global attention shape + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, seq_len], + ) + # test local attention shape + encoder_key_length = self._get_local_attn_seq_len_block_len_windows(seq_len, chunk_length)[0] + self.assertListEqual( + list(attentions[-1].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, encoder_key_length], + ) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + # test global attention shape + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, seq_len], + ) + + # test local attention shape + self.assertListEqual( + list(self_attentions[-1].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, encoder_key_length], + ) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), 
(max_length - min_length) * num_beam_groups) + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length + idx if not use_cache else 1 + src_len = min_length + idx + global_expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + + local_seq_len, block_len, windows = self._get_local_attn_seq_len_block_len_windows( + src_len, config.window_size + ) + block_len = 1 if use_cache else block_len + local_expected_shape = ( + batch_size * num_beam_groups, + windows, + config.num_attention_heads, + block_len, + local_seq_len, + ) + + shapes = [layer_attention.shape for layer_attention in iter_attentions] + # every other layer is a local attention layer, + # so alternate between the expected shapes + expected_shape = [ + global_expected_shape if i % 2 == 0 else local_expected_shape for i, _ in enumerate(iter_attentions) + ] + # check attn size + self.assertListEqual(shapes, expected_shape) + + +@require_torch +class GPTNeoLocalAttentionTest(unittest.TestCase): + def _get_hidden_states(self): + return torch.tensor( + [ + [ + [0.4983, -0.7584, -1.6944, 0.5440], + [2.6918, 0.4206, 0.4176, 0.2055], + [-0.0071, -0.0405, -1.4920, -0.3630], + [1.0492, 0.1599, -1.7648, 0.2419], + [-1.8348, 2.0514, -0.1946, 0.3203], + [0.7672, -1.1600, -1.7118, -0.9056], + [0.2986, 0.5372, 0.7729, -0.1927], + [0.0285, 0.2629, -1.1156, -1.1992], + ] + ], + dtype=torch.float32, + device=torch_device, + ) + + def test_look_back(self): + hidden_states = self._get_hidden_states() + batch_size, seq_length, hidden_size = hidden_states.shape + + # check when seq_length is divisible by window_size + window_size = 4 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + # The last block should contain the last (window_size + block_length) hidden_states + self.assertTrue( + torch.all(blocked_hidden_states[:, -1, ...] == hidden_states[:, -(window_size + block_length) :, ...]) + ) + + # check when seq_length is not divisible by window_size + window_size = 3 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + # The last block should contain the last (window_size + block_length) hidden_states + self.assertTrue( + torch.all(blocked_hidden_states[:, -1, ...] == hidden_states[:, -(window_size + block_length) :, ...]) + )
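+ # note: _get_block_length_and_num_blocks falls back to the largest block_length <= window_size + # that evenly divides seq_length, e.g. block_length = 4 for the 8-token input above when + # window_size = 4, but block_length = 2 (giving 4 blocks) when window_size = 3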
# check when window_size is > seq_length + window_size = 19 + block_length, num_block = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + blocked_hidden_states = GPTNeoAttentionMixin._look_back(hidden_states, block_length, window_size) + expected_shape = [batch_size, num_block, window_size + block_length, hidden_size] + self.assertListEqual(list(blocked_hidden_states.shape), expected_shape) + + # when window_size > seq_length, num_blocks becomes 1, in this case + # the first window_size values in blocked_hidden_states are all zeros + # and the last block_length values are equal to the hidden_states + values = blocked_hidden_states[:, -1, :window_size, ...] + expected_values = torch.zeros_like(values) + self.assertTrue(torch.all(values == expected_values)) + + self.assertTrue(torch.all(blocked_hidden_states[:, -1, -block_length:, ...] == hidden_states)) + + def test_create_attention_mask(self): + config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny") + window_size = config.window_size + batch_size, seq_length = 8, 1 + block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size) + + # causal_mask = layer._create_attention_mask(batch_size, seq_length, num_blocks, block_length, torch_device) + causal_mask = GPTNeoAttentionMixin.create_local_attention_mask( + batch_size, seq_length, config.window_size, torch_device + ) + # check shapes + expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length] + self.assertListEqual(list(causal_mask.shape), expected_shape) + # first window_size tokens in the first block are always padded + # and should not be attended + self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0)) + # each window can attend at most window_size tokens + self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size)) + + # check if user provided attention_mask is handled correctly + attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=torch_device) + attention_mask[:, -3:] = 0 # don't attend last 3 tokens + + # causal_mask = layer._create_attention_mask( + # batch_size, seq_length, num_blocks, block_length, torch_device, attention_mask + # ) + causal_mask = GPTNeoAttentionMixin.create_local_attention_mask( + batch_size, seq_length, config.window_size, torch_device, attention_mask + ) + # last 3 tokens will be in the last block and should have 0s in causal_mask + self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0)) + # check shapes + expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length] + self.assertListEqual(list(causal_mask.shape), expected_shape) + # first window_size tokens in the first block are always padded + # and should not be attended + self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0)) + # each window can attend at most window_size tokens + self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size)) + + def test_local_attn_probs(self): + model = GPTNeoModel.from_pretrained("valhalla/gpt-neo-random-tiny").eval() + layer = model.h[1].attn.attention.to(torch_device) + hidden_states = self._get_hidden_states() + hidden_states = torch.cat([hidden_states, hidden_states - 0.5], dim=2) + batch_size, seq_length, hidden_size = hidden_states.shape + mask_tokens = 3 + attention_mask = torch.ones(batch_size, seq_length, device=torch_device, dtype=torch.long)
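+ # zeroing the last mask_tokens entries of attention_mask simulates right-padding; the + # local causal mask built from it below should then assign those positions no attention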
attention_mask[:, -mask_tokens:] = 0 # don't attend to the last mask_tokens + local_causal_mask = GPTNeoAttentionMixin.create_local_attention_mask( + batch_size, seq_length, model.config.window_size, torch_device, attention_mask + ) + + _, attn_probs = layer(hidden_states, attention_mask=local_causal_mask, output_attentions=True) + + # the last 3 tokens will be in the last block, and should have 0 attn_probs + self.assertTrue(torch.all(attn_probs[:, -1, :, -mask_tokens:, -mask_tokens:] == 0)) + # the first config.window_size tokens in the first block are always padded + # and should have 0 attn_probs + self.assertTrue(torch.all(attn_probs[:, 0, :, : model.config.window_size, : model.config.window_size] == 0)) + + +@require_torch +class GPTNeoModelLanguageGenerationTest(unittest.TestCase): + @cached_property + def model(self): + return GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(torch_device) + + @cached_property + def tokenizer(self): + return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") + + @slow + def test_lm_generate_gpt_neo(self): + for checkpointing in [True, False]: + model = self.model + model.config.gradient_checkpointing = checkpointing + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog + # fmt: off + # The dog-eared copy of the book, which is a collection of essays by the late author, + expected_output_ids = [464, 3290, 12, 3380, 4866, 286, 262, 1492, 11, 543, 318, 257, 4947, 286, 27126, 416, 262, 2739, 1772, 11] + # fmt: on + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_gpt_neo_sample(self): + model = self.model + tokenizer = self.tokenizer + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = "Today is a nice day and if you don’t get the memo here is what you can" + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + + @slow + def test_batch_generation(self): + model = self.model + tokenizer = self.tokenizer + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I am", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
expected_output_sentence = [ + "Hello, my dog is a little bit of a kitty. She is a very sweet and loving", + "Today, I am going to talk about the best way to get a job in the", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + for model_name in GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = GPTNeoModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/test_modeling_ibert.py b/test_modeling_ibert.py new file mode 100755 index 0000000000000000000000000000000000000000..d0b672193cc18cd927244f7baf0e7def61708866 --- /dev/null +++ b/test_modeling_ibert.py @@ -0,0 +1,696 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + IBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + IBertConfig, + IBertForMaskedLM, + IBertForMultipleChoice, + IBertForQuestionAnswering, + IBertForSequenceClassification, + IBertForTokenClassification, + IBertModel, + ) + from transformers.models.ibert.modeling_ibert import ( + IBertEmbeddings, + IntGELU, + IntLayerNorm, + IntSoftmax, + QuantAct, + QuantEmbedding, + QuantLinear, + create_position_ids_from_input_ids, + ) + + +class IBertModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = 
ids_tensor([self.batch_size], self.num_choices) + + config = IBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + quant_mode=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = IBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = IBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + 
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class IBertModelTest(ModelTesterMixin, unittest.TestCase): + + test_pruning = False + test_torchscript = False + test_head_masking = False + test_resize_embeddings = False + + all_model_classes = ( + ( + IBertForMaskedLM, + IBertModel, + IBertForSequenceClassification, + IBertForTokenClassification, + IBertForMultipleChoice, + IBertForQuestionAnswering, + ) + if is_torch_available() + else () + ) + + def setUp(self): + self.model_tester = IBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=IBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + # I-BERT only supports absolute embedding + for type in ["absolute"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in IBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = IBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that position ids are assigned sequentially only to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is IBertEmbeddings.padding_idx + 1 + """
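+ # e.g. with the RoBERTa-style default padding_idx of 1 (assumed here; the test config does + # not override pad_token_id), input ids [12, 31, 13, 1] should yield position ids [2, 3, 4, 1]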
config = self.model_tester.prepare_config_and_inputs()[0] + model = IBertEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that position ids are assigned sequentially only to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is IBertEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = IBertEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + # Override + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), QuantEmbedding) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + # Override + def test_feed_forward_chunking(self): + pass # I-BERT does not support chunking + + # Override + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + embed, embed_scaling_factor = wte(input_ids) + inputs["inputs_embeds"] = embed + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + +@require_torch +class IBertModelIntegrationTest(unittest.TestCase): + def test_quant_embedding(self): + weight_bit = 8 + embedding = QuantEmbedding(2, 4, quant_mode=True, weight_bit=weight_bit) + embedding_weight = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + embedding.weight = nn.Parameter(embedding_weight) + + expected_scaling_factor = embedding_weight.abs().max() / (2 ** (weight_bit - 1) - 1)
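+ # symmetric quantization: scale = max|w| / (2**(bit-1) - 1); here that is 8.0 / 127 ≈ 0.063, + # so every de-quantized weight should sit within one scale step of the original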
x, x_scaling_factor = embedding(torch.tensor(0)) + y, y_scaling_factor = embedding(torch.tensor(1)) + + # scaling factor should follow the symmetric quantization rule + self.assertTrue(torch.allclose(x_scaling_factor, expected_scaling_factor, atol=1e-4)) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + self.assertTrue(torch.allclose(x, embedding_weight[0], atol=expected_scaling_factor)) + self.assertTrue(torch.allclose(y, embedding_weight[1], atol=expected_scaling_factor)) + + def test_quant_act(self): + def _test_range(): + act = QuantAct(activation_bit, act_range_momentum, quant_mode=True) + + # First pass + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + x_scaling_factor = torch.tensor(1.0) + y, y_scaling_factor = act(x, x_scaling_factor) + y_int = y / y_scaling_factor + + # After the first pass, x_min and x_max should be initialized with x.min() and x.max() + expected_x_min, expected_x_max = x.min(), x.max() + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + + # scaling factor should follow the symmetric quantization rule + expected_range = torch.max(expected_x_min.abs(), expected_x_max.abs()) + expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + self.assertTrue(torch.allclose(x, y, atol=expected_scaling_factor)) + + # output should be integer + self.assertTrue(torch.allclose(y_int, y_int.round(), atol=1e-4)) + + # Second pass + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 2 + x_scaling_factor = torch.tensor(1.0) + y, y_scaling_factor = act(x, x_scaling_factor) + y_int = y / y_scaling_factor + + # From the second pass, x_min and x_max should be updated with a moving average + expected_x_min = expected_x_min * act_range_momentum + x.min() * (1 - act_range_momentum) + expected_x_max = expected_x_max * act_range_momentum + x.max() * (1 - act_range_momentum) + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + + # scaling factor should follow the symmetric quantization rule + expected_range = torch.max(expected_x_min.abs(), expected_x_max.abs()) + expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + x = x.clamp(min=-expected_range, max=expected_range) + self.assertTrue(torch.allclose(x, y, atol=expected_scaling_factor)) + + # output should be integer + self.assertTrue(torch.allclose(y_int, y_int.round(), atol=1e-4)) + + # Third pass, with eval() + act.eval() + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 3 + + # In eval mode, min/max and scaling factor must be fixed + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + def _test_identity(): + # test if identity and identity_scaling_factor are given + # should add the input values
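+ # e.g. for the first entries below, x = -1.0 (scale 1.0) and y = 6.0 (scale 0.5), so the + # requantized sum z should stay close to x + y = 5.0 while z / z_scaling_factor stays integer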
act = QuantAct(activation_bit, act_range_momentum, quant_mode=True) + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + y = torch.tensor([[6.0, -7.0, 1.0, -2.0], [3.0, -4.0, -8.0, 5.0]]) + x_scaling_factor = torch.tensor(1.0) + y_scaling_factor = torch.tensor(0.5) + z, z_scaling_factor = act(x, x_scaling_factor, y, y_scaling_factor) + z_int = z / z_scaling_factor + self.assertTrue(torch.allclose(x + y, z, atol=0.1)) + self.assertTrue(torch.allclose(z_int, z_int.round(), atol=1e-4)) + + activation_bit = 8 + act_range_momentum = 0.95 + _test_range() + _test_identity() + + def test_quant_linear(self): + def _test(per_channel): + linear_q = QuantLinear(2, 4, quant_mode=True, per_channel=per_channel, weight_bit=weight_bit) + linear_dq = QuantLinear(2, 4, quant_mode=False, per_channel=per_channel, weight_bit=weight_bit) + linear_weight = torch.tensor([[-1.0, 2.0, 3.0, -4.0], [5.0, -6.0, -7.0, 8.0]]).T + linear_q.weight = nn.Parameter(linear_weight) + linear_dq.weight = nn.Parameter(linear_weight) + + q, q_scaling_factor = linear_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq, dq_scaling_factor = linear_dq(x, x_scaling_factor) + + if per_channel: + q_max = linear_weight.abs().max(dim=1).values + else: + q_max = linear_weight.abs().max() + expected_scaling_factor = q_max / (2 ** (weight_bit - 1) - 1) + + # scaling factor should follow the symmetric quantization rule + self.assertTrue(torch.allclose(linear_q.fc_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # output of the normal linear layer and the quantized linear layer should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized linear layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + weight_bit = 8 + x = torch.tensor([[2.0, -5.0], [-3.0, 4.0]]) + x_scaling_factor = torch.tensor([1.0]) + _test(True) + _test(False) + + def test_int_gelu(self): + gelu_q = IntGELU(quant_mode=True) + gelu_dq = nn.GELU() + + x_int = torch.range(-10000, 10000, 1) + x_scaling_factor = torch.tensor(0.001) + x = x_int * x_scaling_factor + + q, q_scaling_factor = gelu_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = gelu_dq(x) + + # output of the normal GELU and the quantized GELU should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized GELU layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + def test_force_dequant_gelu(self): + x_int = torch.range(-10000, 10000, 1) + x_scaling_factor = torch.tensor(0.001) + x = x_int * x_scaling_factor + + gelu_dq = IntGELU(quant_mode=False) + gelu_fdqs_dict = { + True: [ + IntGELU(quant_mode=True, force_dequant="nonlinear"), + IntGELU(quant_mode=True, force_dequant="gelu"), + ], + False: [ + IntGELU(quant_mode=True, force_dequant="none"), + IntGELU(quant_mode=True, force_dequant="softmax"), + IntGELU(quant_mode=True, force_dequant="layernorm"), + ], + } + + dq, dq_scaling_factor = gelu_dq(x, x_scaling_factor) + for label, gelu_fdqs in gelu_fdqs_dict.items(): + for gelu_fdq in gelu_fdqs: + q, q_scaling_factor = gelu_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def test_int_softmax(self): + output_bit = 8 + softmax_q = IntSoftmax(output_bit, quant_mode=True) + softmax_dq = nn.Softmax() + + # x_int = torch.range(-10000, 10000, 1) + def _test(array): + x_int = torch.tensor(array) + x_scaling_factor = 
torch.tensor(0.1) + x = x_int * x_scaling_factor + + q, q_scaling_factor = softmax_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = softmax_dq(x) + + # output of the normal Softmax and the quantized Softmax should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized Softmax layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + # Output of the quantized Softmax should not exceed 2 ** output_bit + self.assertTrue(q.abs().max() < 2 ** output_bit) + + array = [[i + j for j in range(10)] for i in range(-10, 10)] + _test(array) + array = [[i + j for j in range(50)] for i in range(-10, 10)] + _test(array) + array = [[i + 100 * j for j in range(2)] for i in range(-10, 10)] + _test(array) + + def test_force_dequant_softmax(self): + output_bit = 8 + array = [[i + j for j in range(10)] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + softmax_dq = IntSoftmax(output_bit, quant_mode=False) + softmax_fdqs_dict = { + True: [ + IntSoftmax(output_bit, quant_mode=True, force_dequant="nonlinear"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="softmax"), + ], + False: [ + IntSoftmax(output_bit, quant_mode=True, force_dequant="none"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="gelu"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="layernorm"), + ], + } + + dq, dq_scaling_factor = softmax_dq(x, x_scaling_factor) + for label, softmax_fdqs in softmax_fdqs_dict.items(): + for softmax_fdq in softmax_fdqs: + q, q_scaling_factor = softmax_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def test_int_layernorm(self): + output_bit = 8 + + # some random matrix + array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + ln_q = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit) + ln_dq = nn.LayerNorm(x.shape[1:], 1e-5) + + ln_q.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_q.bias = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:])) + + q, q_scaling_factor = ln_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = ln_dq(x) + + # output of the normal LN and the quantized LN should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized LayerNorm layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + def test_force_dequant_layernorm(self): + output_bit = 8 + array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + ln_dq = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=False, output_bit=output_bit) + ln_fdqs_dict = { + True: [ + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="nonlinear"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="layernorm"), + ], + False: [ + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="none"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="gelu"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="softmax"), + ], + }
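+ # the dict keys encode the expectation: force_dequant values under True disable quantization + # for this nonlinearity, so its output must match the full-precision reference exactly, while + # values under False leave it quantized, so the outputs must differ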
ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:])) + dq, dq_scaling_factor = ln_dq(x, x_scaling_factor) + for label, ln_fdqs in ln_fdqs_dict.items(): + for ln_fdq in ln_fdqs: + ln_fdq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_fdq.bias = nn.Parameter(torch.ones(x.shape[1:])) + q, q_scaling_factor = ln_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def quantize(self, model): + # Helper function that quantizes the given model + # Recursively sets all the `quant_mode` attributes to `True` + if hasattr(model, "quant_mode"): + model.quant_mode = True + elif type(model) == nn.Sequential: + for n, m in model.named_children(): + self.quantize(m) + elif type(model) == nn.ModuleList: + for n in model: + self.quantize(n) + else: + for attr in dir(model): + mod = getattr(model, attr) + if isinstance(mod, nn.Module) and mod != model: + self.quantize(mod) + + @slow + def test_inference_masked_lm(self): + # I-BERT should be "equivalent" to RoBERTa if not quantized + # Test copied from `test_modeling_roberta.py` + model = IBertForMaskedLM.from_pretrained("kssteven/ibert-roberta-base") + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + # I-BERT should be "similar" to RoBERTa if quantized + self.quantize(model) + output = model(input_ids)[0] + self.assertEqual(output.shape, expected_shape) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=0.1)) + + @slow + def test_inference_classification_head(self): + # I-BERT should be "equivalent" to RoBERTa if not quantized + # Test copied from `test_modeling_roberta.py` + model = IBertForSequenceClassification.from_pretrained("kssteven/ibert-roberta-large-mnli") + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + # I-BERT should be "similar" to RoBERTa if quantized + self.quantize(model) + output = model(input_ids)[0] + self.assertEqual(output.shape, expected_shape) + self.assertTrue(torch.allclose(output, expected_tensor, atol=0.1)) diff --git a/test_modeling_layoutlm.py b/test_modeling_layoutlm.py new file mode 100644 index 0000000000000000000000000000000000000000..a62d13e8fcc63ae58c3ce876e1eb66e2ddf9cdb2 --- /dev/null +++ b/test_modeling_layoutlm.py @@ -0,0 +1,336 @@ +# coding=utf-8 +# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + LayoutLMConfig, + LayoutLMForMaskedLM, + LayoutLMForSequenceClassification, + LayoutLMForTokenClassification, + LayoutLMModel, + ) + + +class LayoutLMModelTester: + """You can also import this, e.g. `from .test_modeling_layoutlm import LayoutLMModelTester`""" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + range_bbox=1000, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.range_bbox = range_bbox + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) + # Ensure that bbox is legal + for i in range(bbox.shape[0]): + for j in range(bbox.shape[1]): + if bbox[i, j, 3] < bbox[i, j, 1]: + t = bbox[i, j, 3] + bbox[i, j, 3] = bbox[i, j, 1] + bbox[i, j, 1] = t + if bbox[i, j, 2] < bbox[i, j, 0]: + t = bbox[i, j, 2] + bbox[i, j, 2] = bbox[i, j, 0] + bbox[i, j, 0] = t + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = 
LayoutLMConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LayoutLMModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, bbox, token_type_ids=token_type_ids) + result = model(input_ids, bbox) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LayoutLMForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = LayoutLMForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = LayoutLMForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "bbox": bbox, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_torch +class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + LayoutLMModel, + LayoutLMForMaskedLM, + LayoutLMForSequenceClassification, + LayoutLMForTokenClassification, + ) + if is_torch_available() + else None + ) + + def setUp(self): + self.model_tester = LayoutLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) + + def test_config(self): + 
self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + +def prepare_layoutlm_batch_inputs(): + # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: + # fmt: off + input_ids = torch.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]],device=torch_device) # noqa: E231 + attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],],device=torch_device) # noqa: E231 + bbox = torch.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]],device=torch_device) # noqa: E231 + token_type_ids = torch.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]],device=torch_device) # noqa: E231 + # these are sequence labels (i.e. 
at the token level: one label per token, with -100 at positions ignored by the loss) + labels = torch.tensor([[-100,10,10,10,9,1,-100,7,7,-100,7,7,4,2,5,2,8,8,-100,-100,5,0,3,2,-100],[-100,12,12,12,-100,12,10,-100,-100,-100,-100,10,12,9,-100,-100,-100,10,10,10,9,12,-100,10,-100]],device=torch_device) # noqa: E231 + # fmt: on + + return input_ids, attention_mask, bbox, token_type_ids, labels + + +@require_torch +class LayoutLMModelIntegrationTest(unittest.TestCase): + @slow + def test_forward_pass_no_head(self): + model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased").to(torch_device) + + input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids) + + # test the sequence output on [0, :3, :3] + expected_slice = torch.tensor( + [[0.1785, -0.1947, -0.0425], [-0.3254, -0.2807, 0.2553], [-0.5391, -0.3322, 0.3364]], + device=torch_device, + ) + + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3)) + + # test the pooled output on [1, :3] + expected_slice = torch.tensor([-0.6580, -0.0214, 0.8552], device=torch_device) + + self.assertTrue(torch.allclose(outputs.pooler_output[1, :3], expected_slice, atol=1e-3)) + + @slow + def test_forward_pass_sequence_classification(self): + # initialize model with randomly initialized sequence classification head + model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=2).to( + torch_device + ) + + input_ids, attention_mask, bbox, token_type_ids, _ = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=torch.tensor([1, 1], device=torch_device), + ) + + # test whether we get a loss as a scalar + loss = outputs.loss + expected_shape = torch.Size([]) + self.assertEqual(loss.shape, expected_shape) + + # test the shape of the logits + logits = outputs.logits + expected_shape = torch.Size((2, 2)) + self.assertEqual(logits.shape, expected_shape) + + @slow + def test_forward_pass_token_classification(self): + # initialize model with randomly initialized token classification head + model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=13).to( + torch_device + ) + + input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model( + input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels + ) + + # the loss should be around 2.65, but it currently fluctuates by roughly 0.1-0.3 in + # absolute value between runs, so the assertion stays disabled: + # expected_loss = torch.tensor(2.65, device=torch_device) + # self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=0.1)) + + # test the shape of the logits + logits = outputs.logits + expected_shape = torch.Size((2, 25, 13)) + self.assertEqual(logits.shape, expected_shape) diff --git a/test_modeling_led.py b/test_modeling_led.py new file mode 100644 index 0000000000000000000000000000000000000000..e507922762f159b3c4ca29c865b01b63e664a0fb --- /dev/null +++ b/test_modeling_led.py @@ -0,0 +1,559 @@ +# coding=utf-8 +# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch LED model. """ + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + LEDConfig, + LEDForConditionalGeneration, + LEDForQuestionAnswering, + LEDForSequenceClassification, + LEDModel, + LEDTokenizer, + ) + from transformers.models.led.modeling_led import LEDDecoder, LEDEncoder + + +def prepare_led_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class LEDModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=11, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=32, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + attention_window=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.attention_window = attention_window + + # 
`ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size + # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention + # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] + # because its local attention only attends to `self.attention_window + 1` locations + # (assuming no token with global attention, otherwise the last dimension of attentions + # is x + self.attention_window + 1, where x is the number of tokens with global attention). + # In these tests x is 1, hence the attention_window + 2 below + self.encoder_key_length = self.attention_window + 2 + + # because of padding, `encoder_seq_length` differs from `seq_length`: the sequence is padded + # up to a multiple of `attention_window` (e.g. 11 -> 12 for attention_window=4). Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = ( + self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window + ) + + def prepare_config_and_inputs(self): + # clamp so the sampled ids stay clear of the special tokens (bos=0, pad=1, eos=2) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = LEDConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + attention_window=self.attention_window, + ) + inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + global_attention_mask = torch.zeros_like(inputs_dict["input_ids"]) + global_attention_mask[:, -1] = 1 + inputs_dict["global_attention_mask"] = global_attention_mask + + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = LEDModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next tokens and a matching extension of the attention mask + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append the new tokens to input_ids and attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() +
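# the pass with `past_key_values` was fed only the 3 new tokens, so it returns hidden states for + # just those positions; slice the no-past output to its last 3 positions to compare like with like + output_from_no_past_slice = 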
output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = LEDModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = LEDEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder( + inputs_dict["input_ids"], + attention_mask=inputs_dict["attention_mask"], + global_attention_mask=inputs_dict["global_attention_mask"], + )[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = LEDDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + def check_global_attention(self, config, inputs_dict): + model = LEDModel(config=config).to(torch_device).eval() + model.config.output_attentions = True + attention_mask = ids_tensor(inputs_dict["input_ids"].shape, vocab_size=2) + global_attention_mask = torch.zeros_like(attention_mask) + + # give a couple of tokens global attention + num_tokens_with_global_attention = 2 + + attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1 + global_attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1 + inputs_dict["attention_mask"] = attention_mask + inputs_dict["global_attention_mask"] = global_attention_mask + + outputs = model(**inputs_dict) + self.parent.assertIsNotNone(outputs.encoder_global_attentions) + + # with `num_tokens_with_global_attention` global tokens, the last dimension of the + # global attentions has size `num_tokens_with_global_attention` + self.parent.assertEqual( + outputs.encoder_global_attentions[0].shape, + (self.batch_size, self.num_attention_heads, self.encoder_seq_length, num_tokens_with_global_attention), + ) + + +@require_torch +class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (LEDModel, LEDForConditionalGeneration, LEDForSequenceClassification, LEDForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (LEDForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = LEDModelTester(self) + self.config_tester = ConfigTester(self, config_class=LEDConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + +
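# strict save/load round-trip: reloading with `output_loading_info=True` must report no missing keys + with 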
tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_global_attention(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_global_attention(*config_and_inputs) + + # LEDForSequenceClassification does not support inputs_embeds, so it is excluded here + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (LEDModel, LEDForConditionalGeneration, LEDForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = LEDForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + def test_retain_grad_hidden_states_attentions(self): + # LED's Longformer-style self-attention cannot retain gradients on attentions or + # hidden states, so the common test is skipped + return + + def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length): + # tgt_length is seq_length rounded up to the next multiple of the attention window + tgt_length = ( + seq_length // config.attention_window[0] + (seq_length % config.attention_window[0] != 0) + ) * config.attention_window[0] + + encoder_expected_shape = (batch_size, config.num_attention_heads, tgt_length, seq_length) + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [layer_attentions.shape for layer_attentions in attentions], + [encoder_expected_shape] * len(attentions), + ) + + def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length): + # seq_length is rounded up to the next multiple of the attention window + seq_length = ( + seq_length // config.attention_window[0] + (seq_length % config.attention_window[0] != 0) + ) * config.attention_window[0] + + encoder_expected_shape = (batch_size, seq_length, config.hidden_size) + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in hidden_states], + [encoder_expected_shape] * len(hidden_states), + ) + + def test_attention_outputs(self): +
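# encoder self-attention is local, so the attention key dimension is `encoder_key_length` + # (attention_window + 1 local positions plus one token with global attention), not the full sequence + config, inputs_dict = 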
self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_length = self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + encoder_key_length = self.model_tester.encoder_key_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also works when set via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # encoder global attention outputs are returned as well, one more entry than usual => 6 + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + seq_length, + seq_length, + ], + ) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice AssertionError.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + +
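+# Illustrative example (not executed): for small tensors the raw values are shown, e.g. +# assert_tensors_close(torch.tensor([1.0, 2.0]), torch.tensor([1.0, 2.5]), prefix="logits") +# -> AssertionError: logits: tensor([1., 2.]) != tensor([1.0000, 2.5000]) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class LEDModelIntegrationTests(unittest.TestCase): + """All the below results were obtained with the original checkpoints and code + base from https://github.com/allenai/longformer. 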
+ IMPORTANT: Note that the original checkpoints include a `postion_embeddings` "hack" + and have to be cut to have the correct shape. + See: https://github.com/huggingface/transformers/pull/9278#issue-544709661. + """ + + @cached_property + def default_tokenizer(self): + return LEDTokenizer.from_pretrained("allenai/led-base-16384") + + def test_inference_no_head(self): + model = LEDModel.from_pretrained("allenai/led-base-16384").to(torch_device) + + # change to intended input + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict).last_hidden_state + expected_shape = torch.Size((1, 1024, 768)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to(torch_device) + + # change to intended input + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict, use_cache=False).logits + expected_shape = torch.Size((1, 1024, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + # this test requires 16GB of RAM + hf = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv").to(torch_device) + tok = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv") + + ARTICLE_LEP = """the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . 
given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to recent studies , e.g. , for the cp - odd higgs boson @xmath0 we have @xmath27 gev in the nmssm @xcite , @xmath28 gev in the nmssm @xcite , and @xmath29 gev in the lepton - specific 2hdm ( l2hdm ) @xcite . with such a light cp - odd higgs boson , the z - decay into one or more @xmath0 is open up . 
noting that the decay @xmath30 is forbidden due to bose symmetry , we in this work study the rare @xmath1-decays @xmath6 ( @xmath22 ) , @xmath31 and @xmath4 in a comparative way for four models , namely the type - ii 2hdm@xcite , the l2hdm @xcite , the nmssm and the nmssm . in our study , we examine carefully the constraints on the light @xmath0 from many latest experimental results . this work is organized as follows . in sec . ii we briefly describe the four new physics models . in sec . iii we present the calculations of the rare @xmath1-decays . in sec . iv we list the constraints on the four new physics models . in sec . v we show the numerical results for the branching ratios of the rare @xmath1-decays in various models . finally , the conclusion is given in sec . as the most economical way , the sm utilizes one higgs doublet to break the electroweak symmetry . as a result , the sm predicts only one physical higgs boson with its properties totally determined by two free parameters . in new physics models , the higgs sector is usually extended by adding higgs doublets and/or singlets , and consequently , more physical higgs bosons are predicted along with more free parameters involved in . the general 2hdm contains two @xmath32 doublet higgs fields @xmath33 and @xmath34 , and with the assumption of cp - conserving , its scalar potential can be parameterized as@xcite : @xmath35,\end{aligned}\ ] ] where @xmath36 ( @xmath37 ) are free dimensionless parameters , and @xmath38 ( @xmath39 ) are the parameters with mass dimension . after the electroweak symmetry breaking , the spectrum of this higgs sector includes three massless goldstone modes , which become the longitudinal modes of @xmath40 and @xmath1 bosons , and five massive physical states : two cp - even higgs bosons @xmath41 and @xmath42 , one neutral cp - odd higgs particle @xmath0 and a pair of charged higgs bosons @xmath43 . noting the constraint @xmath44 with @xmath45 and @xmath46 denoting the vacuum expectation values ( vev ) of @xmath33 and @xmath34 respectively , we choose @xmath47 as the input parameters with @xmath48 , and @xmath49 being the mixing angle that diagonalizes the mass matrix of the cp - even higgs fields . the difference between the type - ii 2hdm and the l2hdm comes from the yukawa coupling of the higgs bosons to quark / lepton . in the type - ii 2hdm , one higgs doublet @xmath34 generates the masses of up - type quarks and the other doublet @xmath33 generates the masses of down - type quarks and charged leptons ; while in the l2hdm one higgs doublet @xmath33 couples only to leptons and the other doublet @xmath34 couples only to quarks . so the yukawa interactions of @xmath0 to fermions in these two models are given by @xcite @xmath50 with @xmath51 denoting generation index . obviously , in the type - ii 2hdm the @xmath52 coupling and the @xmath53 coupling can be simultaneously enhanced by @xmath54 , while in the l2hdm only the @xmath53 coupling is enhanced by @xmath55 . the structures of the nmssm and the nmssm are described by their superpotentials and corresponding soft - breaking terms , which are given by @xcite @xmath56 where @xmath57 is the superpotential of the mssm without the @xmath25 term , @xmath58 and @xmath59 are higgs doublet and singlet superfields with @xmath60 and @xmath61 being their scalar component respectively , @xmath62 , @xmath63 , @xmath64 , @xmath65 , @xmath66 and @xmath67 are soft breaking parameters , and @xmath68 and @xmath69 are coefficients of the higgs self interactions . 
with the superpotentials and the soft - breaking terms , one can get the higgs potentials of the nmssm and the nmssm respectively . like the 2hdm , the higgs bosons with same cp property will mix and the mass eigenstates are obtained by diagonalizing the corresponding mass matrices : @xmath70 where the fields on the right hands of the equations are component fields of @xmath71 , @xmath72 and @xmath61 defined by @xmath73 @xmath74 and @xmath75 are respectively the cp - even and cp - odd neutral higgs bosons , @xmath76 and @xmath77 are goldstone bosons eaten by @xmath1 and @xmath78 , and @xmath79 is the charged higgs boson . so both the nmssm and nmssm predict three cp - even higgs bosons , two cp - odd higgs bosons and one pair of charged higgs bosons . in general , the lighter cp - odd higgs @xmath0 in these model is the mixture of the singlet field @xmath80 and the doublet field combination , @xmath81 , i.e. @xmath82 and its couplings to down - type quarks are then proportional to @xmath83 . so for singlet dominated @xmath0 , @xmath84 is small and the couplings are suppressed . as a comparison , the interactions of @xmath0 with the squarks are given by@xcite @xmath85 i.e. the interaction does not vanish when @xmath86 approaches zero . just like the 2hdm where we use the vevs of the higgs fields as fundamental parameters , we choose @xmath68 , @xmath69 , @xmath87 , @xmath88 , @xmath66 and @xmath89 as input parameters for the nmssm@xcite and @xmath68 , @xmath54 , @xmath88 , @xmath65 , @xmath90 and @xmath91 as input parameters for the nmssm@xcite . about the nmssm and the nmssm , three points should be noted . the first is for the two models , there is no explicit @xmath92term , and the effective @xmath25 parameter ( @xmath93 ) is generated when the scalar component of @xmath59 develops a vev . the second is , the nmssm is actually same as the nmssm with @xmath94@xcite , because the tadpole terms @xmath95 and its soft breaking term @xmath96 in the nmssm do not induce any interactions , except for the tree - level higgs boson masses and the minimization conditions . and the last is despite of the similarities , the nmssm has its own peculiarity , which comes from its neutralino sector . in the basis @xmath97 , its neutralino mass matrix is given by @xcite @xmath98 where @xmath99 and @xmath100 are @xmath101 and @xmath102 gaugino masses respectively , @xmath103 , @xmath104 , @xmath105 and @xmath106 . after diagonalizing this matrix one can get the mass eigenstate of the lightest neutralino @xmath107 with mass taking the following form @xcite @xmath108 this expression implies that @xmath107 must be lighter than about @xmath109 gev for @xmath110 ( from lower bound on chargnio mass ) and @xmath111 ( perturbativity bound ) . like the other supersymmetric models , @xmath107 as the lightest sparticle acts as the dark matter in the universe , but due to its singlino - dominated nature , it is difficult to annihilate sufficiently to get the correct density in the current universe . so the relic density of @xmath107 plays a crucial way in selecting the model parameters . for example , as shown in @xcite , for @xmath112 , there is no way to get the correct relic density , and for the other cases , @xmath107 mainly annihilates by exchanging @xmath1 boson for @xmath113 , or by exchanging a light cp - odd higgs boson @xmath0 with mass satisfying the relation @xmath114 for @xmath115 . 
for the annihilation , @xmath54 and @xmath25 are required to be less than 10 and @xmath116 respectively because through eq.([mass - exp ] ) a large @xmath87 or @xmath25 will suppress @xmath117 to make the annihilation more difficult . the properties of the lightest cp - odd higgs boson @xmath0 , such as its mass and couplings , are also limited tightly since @xmath0 plays an important role in @xmath107 annihilation . the phenomenology of the nmssm is also rather special , and this was discussed in detail in @xcite . in the type - ii 2hdm , l2hdm , nmssm and nmssm , the rare @xmath1-decays @xmath118 ( @xmath22 ) , @xmath3 and @xmath4 may proceed by the feynman diagrams shown in fig.[fig1 ] , fig.[fig2 ] and fig.[fig3 ] respectively . for these diagrams , the intermediate state @xmath119 represents all possible cp - even higgs bosons in the corresponding model , i.e. @xmath41 and @xmath42 in type - ii 2hdm and l2hdm and @xmath41 , @xmath42 and @xmath120 in nmssm and nmssm . in order to take into account the possible resonance effects of @xmath119 in fig.[fig1](c ) for @xmath2 and fig.[fig3 ] ( a ) for @xmath11 , we have calculated all the decay modes of @xmath119 and properly included the width effect in its propagator . as to the decay @xmath121 , two points should be noted . one is , unlike the decays @xmath6 and @xmath11 , this process proceeds only through loops mediated by quarks / leptons in the type - ii 2hdm and l2hdm , and additionally by sparticles in the nmssm and nmssm . so in most cases its rate should be much smaller than the other two . the other is due to cp - invariance , loops mediated by squarks / sleptons give no contribution to the decay@xcite . in actual calculation , this is reflected by the fact that the coupling coefficient of @xmath122 differs from that of @xmath123 by a minus sign ( see eq.([asqsq ] ) ) , and as a result , the squark - mediated contributions to @xmath121 are completely canceled out . with regard to the rare decay @xmath11 , we have more explanations . in the lowest order , this decay proceeds by the diagram shown in fig.[fig3 ] ( a ) , and hence one may think that , as a rough estimate , it is enough to only consider the contributions from fig.[fig3](a ) . however , we note that in some cases of the type - ii 2hdm and l2hdm , due to the cancelation of the contributions from different @xmath119 in fig.[fig3 ] ( a ) and also due to the potentially largeness of @xmath124 couplings ( i.e. larger than the electroweak scale @xmath125 ) , the radiative correction from the higgs - mediated loops may dominate over the tree level contribution even when the tree level prediction of the rate , @xmath126 , exceeds @xmath20 . on the other hand , we find the contribution from quark / lepton - mediated loops can be safely neglected if @xmath127 in the type - ii 2hdm and the l2hdm . in the nmssm and the nmssm , besides the corrections from the higgs- and quark / lepton - mediated loops , loops involving sparticles such as squarks , charginos and neutralinos can also contribute to the decay . we numerically checked that the contributions from squarks and charginos can be safely neglected if @xmath127 . we also calculated part of potentially large neutralino correction ( note that there are totally about @xmath128 diagrams for such correction ! ) and found they can be neglected too . 
since considering all the radiative corrections will make our numerical calculation rather slow , we only include the most important correction , namely that from higgs - mediated loops , in presenting our results for the four models . one can intuitively understand the relative smallness of the sparticle contribution to @xmath11 as follows . first consider the squark contribution which is induced by the @xmath129 interaction ( @xmath130 denotes the squark in chirality state ) and the @xmath131 interaction through box diagrams . because the @xmath132 interaction conserves the chirality of the squarks while the @xmath133 interaction violates the chirality , to get non - zero contribution to @xmath11 from the squark loops , at least four chiral flippings are needed , with three of them provided by @xmath131 interaction and the rest provided by the left - right squark mixing . this means that , if one calculates the amplitude in the chirality basis with the mass insertion method , the amplitude is suppressed by the mixing factor @xmath134 with @xmath135 being the off diagonal element in squark mass matrix . next consider the chargino / neutralino contributions . since for a light @xmath0 , its doublet component , parameterized by @xmath84 in eq.([mixing ] ) , is usually small , the couplings of @xmath0 with the sparticles will never be tremendously large@xcite . so the chargino / neutralino contributions are not important too . in our calculation of the decays , we work in the mass eigenstates of sparticles instead of in the chirality basis . for the type - ii 2hdm and the l2hdm , we consider the following constraints @xcite : * theoretical constraints on @xmath136 from perturbativity , unitarity and requirements that the scalar potential is finit at large field values and contains no flat directions @xcite , which imply that @xmath137 * the constraints from the lep search for neutral higgs bosons . we compute the signals from the higgs - strahlung production @xmath138 ( @xmath139 ) with @xmath140 @xcite and from the associated production @xmath141 with @xmath142 @xcite , and compare them with the corresponding lep data which have been inputted into our code . we also consider the constraints from @xmath138 by looking for a peak of @xmath143 recoil mass distribution of @xmath1-boson @xcite and the constraint of @xmath144 mev when @xmath145 @xcite . + these constraints limit the quantities such as @xmath146 \times br ( h_i \to \bar{b } b ) $ ] on the @xmath147 plane with the the subscript @xmath148 denoting the coupling coefficient of the @xmath149 interaction . they also impose a model - dependent lower bound on @xmath150 , e.g. , @xmath151 for the type - ii 2hdm ( from our scan results ) , @xmath152 for the l2hdm@xcite , and @xmath153 for the nmssm @xcite . these bounds are significantly lower than that of the sm , i.e. @xmath154 , partially because in new physics models , unconventional decay modes of @xmath155 such as @xmath156 are open up . as to the nmssm , another specific reason for allowing a significantly lighter cp - even higgs boson is that the boson may be singlet - dominated in this model . + with regard to the lightest cp - odd higgs boson @xmath0 , we checked that there is no lower bound on its mass so long as the @xmath157 interaction is weak or @xmath155 is sufficiently heavy . * the constraints from the lep search for a light higgs boson via the yukawa process @xmath158 with @xmath22 and @xmath61 denoting a scalar @xcite . 
these constraints can limit the @xmath159 coupling versus @xmath160 in new physics models . * the constraints from the cleo - iii limit on @xmath161 and the latest babar limits on @xmath162 . these constraints will put very tight constraints on the @xmath163 coupling for @xmath164 . in our analysis , we use the results of fig.8 in the second paper of @xcite to excluded the unfavored points . * the constraints from @xmath165 couplings . since the higgs sector can give sizable higher order corrections to @xmath165 couplings , we calculate them to one loop level and require the corrected @xmath165 couplings to lie within the @xmath166 range of their fitted value . the sm predictions for the couplings at @xmath1-pole are given by @xmath167 and @xmath168 @xcite , and the fitted values are given by @xmath169 and @xmath170 , respectively@xcite . we adopt the formula in @xcite to the 2hdm in our calculation . * the constraints from @xmath171 leptonic decay . we require the new physics correction to the branching ratio @xmath172 to be in the range of @xmath173 @xcite . we use the formula in @xcite in our calculation . + about the constraints ( 5 ) and ( 6 ) , two points should be noted . one is all higgs bosons are involved in the constraints by entering the self energy of @xmath171 lepton , the @xmath174 vertex correction or the @xmath175 vertex correction , and also the box diagrams for @xmath176@xcite . since the yukawa couplings of the higgs bosons to @xmath171 lepton get enhanced by @xmath54 and so do the corrections , @xmath54 must be upper bounded for given spectrum of the higgs sector . generally speaking , the lighter @xmath0 is , the more tightly @xmath54 is limited@xcite . the other point is in the type - ii 2hdm , @xmath177 , b - physics observables as well as @xmath178 decays discussed above can constraint the model in a tighter way than the constraints ( 5 ) and ( 6 ) since the yukawa couplings of @xmath171 lepton and @xmath179 quark are simultaneously enhanced by @xmath54 . but for the l2hdm , because only the yukawa couplings of @xmath171 lepton get enhanced ( see eq.[yukawa ] ) , the constraints ( 5 ) and ( 6 ) are more important in limiting @xmath54 . * indirect constraints from the precision electroweak observables such as @xmath180 , @xmath181 and @xmath182 , or their combinations @xmath183 @xcite . we require @xmath184 to be compatible with the lep / sld data at @xmath185 confidence level@xcite . we also require new physics prediction of @xmath186 is within the @xmath187 range of its experimental value . the latest results for @xmath188 are @xmath189 ( measured value ) and @xmath190 ( sm prediction ) for @xmath191 gev @xcite . in our code , we adopt the formula for these observables presented in @xcite to the type - ii 2hdm and the l2hdm respectively . + in calculating @xmath180 , @xmath181 and @xmath182 , we note that these observables get dominant contributions from the self energies of the gauge bosons @xmath1 , @xmath192 and @xmath193 . since there is no @xmath194 coupling or @xmath195 coupling , @xmath0 must be associated with the other higgs bosons to contribute to the self energies . 
so by the uv convergence of these quantities , one can infer that , for the case of a light @xmath0 and @xmath196 , these quantities depend on the spectrum of the higgs sector in a way like @xmath197 at leading order , which implies that a light @xmath0 can still survive the constraints from the precision electroweak observables given the splitting between @xmath150 and @xmath198 is moderate@xcite . * the constraints from b physics observables such as the branching ratios for @xmath199 , @xmath200 and @xmath201 , and the mass differences @xmath202 and @xmath203 . we require their theoretical predications to agree with the corresponding experimental values at @xmath187 level . + in the type - ii 2hdm and the l2hdm , only the charged higgs boson contributes to these observables by loops , so one can expect that @xmath198 versus @xmath54 is to be limited . combined analysis of the limits in the type - ii 2hdm has been done by the ckmfitter group , and the lower bound of @xmath204 as a function of @xmath87 was given in fig.11 of @xcite . this analysis indicates that @xmath198 must be heavier than @xmath205 at @xmath185 c.l . regardless the value of @xmath54 . in this work , we use the results of fig.11 in @xcite to exclude the unfavored points . as for the l2hdm , b physics actually can not put any constraints@xcite because in this model the couplings of the charged higgs boson to quarks are proportional to @xmath206 and in the case of large @xmath54 which we are interested in , they are suppressed . in our analysis of the l2hdm , we impose the lep bound on @xmath198 , i.e. @xmath207@xcite . * the constraints from the muon anomalous magnetic moment @xmath208 . now both the theoretical prediction and the experimental measured value of @xmath208 have reached a remarkable precision , but a significant deviation still exists : @xmath209 @xcite . in the 2hdm , @xmath208 gets additional contributions from the one - loop diagrams induced by the higgs bosons and also from the two - loop barr - zee diagrams mediated by @xmath0 and @xmath155@xcite . if the higgs bosons are much heavier than @xmath25 lepton mass , the contributions from the barr - zee diagrams are more important , and to efficiently alleviate the discrepancy of @xmath208 , one needs a light @xmath0 along with its enhanced couplings to @xmath25 lepton and also to heavy fermions such as bottom quark and @xmath171 lepton to push up the effects of the barr - zee diagram@xcite . the cp - even higgs bosons are usually preferred to be heavy since their contributions to @xmath208 are negative . + in the type - ii 2hdm , because @xmath54 is tightly constrained by the process @xmath210 at the lep@xcite and the @xmath178 decay@xcite , the barr - zee diagram contribution is insufficient to enhance @xmath208 to @xmath187 range around its measured value@xcite . so in our analysis , we require the type - ii 2hdm to explain @xmath208 at @xmath211 level . while for the l2hdm , @xmath54 is less constrained compared with the type - ii 2hdm , and the barr - zee diagram involving the @xmath171-loop is capable to push up greatly the theoretical prediction of @xmath208@xcite . therefore , we require the l2hdm to explain the discrepancy at @xmath187 level . + unlike the other constraints discussed above , the @xmath208 constraint will put a two - sided bound on @xmath54 since on the one hand , it needs a large @xmath54 to enhance the barr - zee contribution , but on the other hand , too large @xmath54 will result in an unacceptable large @xmath208 . 
* since this paper concentrates on a light @xmath0 , the decay @xmath212 is open up with a possible large decay width . we require the width of any higgs boson to be smaller than its mass to avoid a too fat higgs boson@xcite . we checked that for the scenario characterized by @xmath213 , the coefficient of @xmath214 interaction is usually larger than the electroweak scale @xmath125 , and consequently a large decay width is resulted . for the nmssm and nmssm , the above constraints become more complicated because in these models , not only more higgs bosons are involved in , but also sparticles enter the constraints . so it is not easy to understand some of the constraints intuitively . take the process @xmath199 as an example . in the supersymmetric models , besides the charged higgs contribution , chargino loops , gluino loops as well as neutralino loops also contribute to the process@xcite , and depending on the susy parameters , any of these contributions may become dominated over or be canceled by other contributions . as a result , although the charged higgs affects the process in the same way as that in the type - ii 2hdm , charged higgs as light as @xmath215 is still allowed even for @xmath216@xcite . since among the constraints , @xmath208 is rather peculiar in that it needs new physics to explain the discrepancy between @xmath217 and @xmath218 , we discuss more about its dependence on susy parameters . in the nmssm and the nmssm , @xmath208 receives contributions from higgs loops and neutralino / chargino loops . for the higgs contribution , it is quite similar to that of the type - ii 2hdm except that more higgs bosons are involved in@xcite . for the neutralino / chargino contribution , in the light bino limit ( i.e. @xmath219 ) , it can be approximated by@xcite @xmath220 for @xmath221 with @xmath222 being smuon mass . so combining the two contributions together , one can learn that a light @xmath0 along with large @xmath54 and/or light smuon with moderate @xmath87 are favored to dilute the discrepancy . because more parameters are involved in the constraints on the supersymmetric models , we consider following additional constraints to further limit their parameters : * direct bounds on sparticle masses from the lep1 , the lep2 and the tevatron experiments @xcite . * the lep1 bound on invisible z decay @xmath223 ; the lep2 bound on neutralino production @xmath224 and @xmath225@xcite . * dark matter constraints from the wmap relic density 0.0975 @xmath226 0.1213 @xcite . note that among the above constraints , the constraint ( 2 ) on higgs sector and the constraint ( c ) on neutralino sector are very important . this is because in the supersymmetric models , the sm - like higgs is upper bounded by about @xmath227 at tree level and by about @xmath228 at loop level , and that the relic density restricts the lsp annihilation cross section in a certain narrow range . in our analysis of the nmssm , we calculate the constraints ( 3 ) and ( 5 - 7 ) by ourselves and utilize the code nmssmtools @xcite to implement the rest constraints . we also extend nmssmtools to the nmssm to implement the constraints . for the extension , the most difficult thing we faced is how to adapt the code micromegas@xcite to the nmssm case . we solve this problem by noting the following facts : * as we mentioned before , the nmssm is actually same as the nmssm with the trilinear singlet term setting to zero . so we can utilize the model file of the nmssm as the input of the micromegas and set @xmath229 . 
* since in the nmssm , the lsp is too light to annihilate into higgs pairs , there is no need to reconstruct the effective higgs potential to calculate precisely the annihilation channel @xmath230 with @xmath61 denoting any of higgs bosons@xcite . we thank the authors of the nmssmtools for helpful discussion on this issue when we finish such extension@xcite . with the above constraints , we perform four independent random scans over the parameter space of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively . we vary the parameters in following ranges : @xmath231 for the type - ii 2hdm , @xmath232 for the l2hdm , @xmath233 for the nmssm , and @xmath234 for the nmssm . in performing the scans , we note that for the nmssm and the nmssm , some constraints also rely on the gaugino masses and the soft breaking parameters in the squark sector and the slepton sector . since these parameters affect little on the properties of @xmath0 , we fix them to reduce the number of free parameters in our scan . for the squark sector , we adopt the @xmath235 scenario which assumes that the soft mass parameters for the third generation squarks are degenerate : @xmath236 800 gev , and that the trilinear couplings of the third generation squarks are also degenerate , @xmath237 with @xmath238 . for the slepton sector , we assume all the soft - breaking masses and trilinear parameters to be 100 gev . this setting is necessary for the nmssm since this model is difficult to explain the muon anomalous moment at @xmath239 level for heavy sleptons@xcite . finally , we assume the grand unification relation @xmath240 for the gaugino masses with @xmath241 being fine structure constants of the different gauge group . with large number of random points in the scans , we finally get about @xmath242 , @xmath243 , @xmath244 and @xmath242 samples for the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively which survive the constraints and satisfy @xmath245 . analyzing the properties of the @xmath0 indicates that for most of the surviving points in the nmssm and the nmssm , its dominant component is the singlet field ( numerically speaking , @xmath246 ) so that its couplings to the sm fermions are suppressed@xcite . our analysis also indicates that the main decay products of @xmath0 are @xmath247 for the l2hdm@xcite , @xmath248 ( dominant ) and @xmath247 ( subdominant ) for the type - ii 2hdm , the nmssm and the nmssm , and in some rare cases , neutralino pairs in the nmssm@xcite . in fig.[fig4 ] , we project the surviving samples on the @xmath249 plane . this figure shows that the allowed range of @xmath54 is from @xmath250 to @xmath251 in the type - ii 2hdm , and from @xmath252 to @xmath253 in the l2hdm . just as we introduced before , the lower bounds of @xmath254 come from the fact that we require the models to explain the muon anomalous moment , while the upper bound is due to we have imposed the constraint from the lep process @xmath255 , which have limited the upper reach of the @xmath256 coupling for light @xmath61 @xcite(for the dependence of @xmath256 coupling on @xmath54 , see sec . this figure also indicates that for the nmssm and the nmssm , @xmath54 is upper bounded by @xmath257 . for the nmssm , this is because large @xmath87 can suppress the dark matter mass to make its annihilation difficult ( see @xcite and also sec . 
ii ) , but for the nmssm , this is because we choose a light slepton mass , so that a large @xmath54 would enhance @xmath208 so significantly as to be experimentally unacceptable . we checked that for a slepton mass as heavy as @xmath258 , @xmath259 is still allowed for the nmssm . in fig.[fig5 ] and fig.[fig6 ] , we show the branching ratios of @xmath260 and @xmath261 respectively . fig.[fig5 ] indicates that , among the four models , the type - ii 2hdm predicts the largest ratio for @xmath260 , with its value varying from @xmath262 to @xmath263 . the underlying reason is that in the type - ii 2hdm the @xmath264 coupling is enhanced by @xmath54 ( see fig.[fig4 ] ) , while in the other three models the coupling is suppressed either by @xmath265 or by the singlet component of the @xmath0 . fig.[fig6 ] shows that the l2hdm predicts the largest rate for @xmath266 , with its value reaching @xmath5 in the optimal case , and for the other three models the ratio of @xmath261 is at least about one order of magnitude smaller than that of @xmath267 . this feature can be easily understood from the @xmath268 coupling introduced in sect . we emphasize that , if nature prefers a light @xmath0 , @xmath260 and/or @xmath269 in the type - ii 2hdm and the l2hdm will be observable at the gigaz . then , from the rates of the two decays , one can determine whether the type - ii 2hdm or the l2hdm is the right theory . on the other hand , if both decays are observed with small rates or fail to be observed , the singlet extensions of the mssm are favored . in fig.[fig7 ] , we show the rate of @xmath3 as a function of @xmath270 . this figure indicates that the branching ratio of @xmath121 can reach @xmath271 , @xmath272 , @xmath273 and @xmath274 for the optimal cases of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively , which implies that the decay @xmath121 will never be observable at the gigaz if the studied model is chosen by nature . the reason for the smallness is , as we pointed out before , that the decay @xmath121 proceeds only at loop level . comparing the optimal cases of the type - ii 2hdm , the nmssm and the nmssm shown in fig.[fig5]-[fig7 ] , one may find that the relation @xmath275 holds for any of the decays . this is because the decays are all induced by yukawa couplings with a similar structure in these models . in the supersymmetric models , the large singlet component of the light @xmath0 tends to suppress the yukawa couplings , and the @xmath0 in the nmssm has a larger singlet component than that in the nmssm . next we consider the decay @xmath11 , which , unlike the above decays , depends on the higgs self - interactions . in fig.[fig8 ] we plot its rate as a function of @xmath270 ; this figure indicates that @xmath276 may be the largest among the ratios of the exotic @xmath1 decays , reaching @xmath277 in the optimal cases of the type - ii 2hdm , the l2hdm and the nmssm . the underlying reason is that , in some cases , the intermediate state @xmath119 in fig.[fig3 ] ( a ) may be on - shell . in fact , we find this is one of the main differences between the nmssm and the nmssm : in the nmssm , @xmath119 in fig.[fig3 ] ( a ) may be on - shell ( corresponding to the points with large @xmath278 ) , while in the nmssm this seems impossible . so we conclude that the decay @xmath11 may serve as an alternative channel to test new physics models ; in particular , it may be used to distinguish the nmssm from the nmssm if supersymmetry is found at the lhc and @xmath11 is observed at the gigaz with a large rate .
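to get a feel for what such branching ratios mean experimentally , one can convert them into expected event counts . the short python sketch below is our own back - of - the - envelope illustration , taking the commonly quoted gigaz sample of about 10^9 z bosons ( an assumption , not a number from this paper ) and hypothetical stand - in branching ratios for the elided values :

n_z = 1.0e9                      # assumed GigaZ Z-boson sample size
for br in (1e-4, 1e-6, 1e-9):
    # expected number of rare decays = sample size times branching ratio
    print(f"BR = {br:.0e}  ->  about {n_z * br:,.0f} expected decays")

a branching ratio of order 10^-4 thus corresponds to tens of thousands of events , while one of order 10^-9 gives at most a single event , which is why such loop - suppressed decays are unobservable at the gigaz .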
before we end our discussion , we note that in the nmssm the higgs boson @xmath0 may be lighter than @xmath279 without conflicting with the low energy data from @xmath178 decays and the other observables ( see fig.[fig4]-[fig8 ] ) . in this case , @xmath0 is axion - like , as pointed out in @xcite . we checked that , among the rare @xmath1 decays discussed in this paper , the largest branching ratio comes from @xmath280 , which can reach @xmath281 . since in this case the decay product of @xmath0 is a highly collinear muon pair , detecting the decay @xmath280 may require detailed knowledge of the detectors , which is beyond the scope of our discussion . in this paper , we studied the rare @xmath1-decays @xmath2 ( @xmath7 ) , @xmath282 and @xmath4 in the type - ii 2hdm , the lepton - specific 2hdm , the nmssm and the nmssm , which predict a light cp - odd higgs boson @xmath0 . in the parameter space allowed by current experiments , the branching ratio can be as large as @xmath5 for @xmath118 , @xmath8 for @xmath3 and @xmath9 for @xmath4 , which implies that the decays @xmath2 and @xmath283 may be accessible at the gigaz option . since different models predict different sizes of branching ratios , measuring these rare decays can help to discriminate among the models . this work was supported in part by hastit under grant no . 2009hastit004 , by the national natural science foundation of china ( nnsfc ) under grant nos . 10821504 , 10725526 , 10635030 , 10775039 , 11075045 and by the project of knowledge innovation program ( pkip ) of chinese academy of sciences under grant no . . for some reviews , see , e.g. , m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . phys . a * 19 * , 159 ( 2004 ) ; j. m. yang , arxiv:1006.2594 . j. i. illana , m. masip , 67 , 035004 ( 2003 ) ; j. cao , z. xiong , j. m. yang , 32 , 245 ( 2004 ) . d. atwood _ et al . _ , 66 , 093005 ( 2002 ) . j. kalinowski and s. pokorski , 219 , 116 ( 1989 ) ; a. djouadi , p. m. zerwas and j. zunft , 259 , 175 ( 1991 ) ; a. djouadi , j. kalinowski and p. m. zerwas , z. phys . c * 54 * , 255 ( 1992 ) . m. krawczyk , _ et al . _ , 19 , 463 ( 2001 ) ; 8 , 495 ( 1999 ) . j. f. gunion , g. gamberini and s. f. novaes , 38 , 3481 ( 1988 ) ; thomas j. weiler and tzu - chiang yuan , 318 , 337 ( 1989 ) ; a. djouadi , _ et al . _ , 1 , 163 ( 1998 ) [ hep - ph/9701342 ] . d. chang and w. y. keung , phys . rev . lett . * 77 * , 3732 ( 1996 ) . e. keith and e. ma , 57 , 2017 ( 1998 ) ; m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . phys . a * 19 * , 159 ( 2004 ) . f. larios , g. tavares - velasco and c. p. yuan , 64 , 055004 ( 2001 ) ; 66 , 075006 ( 2002 ) . a. djouadi , _ et al . _ , 10 , 27 ( 1999 ) [ hep - ph/9903229 ] . for a detailed introduction to the nmssm , see f. franke and h. fraas , int . j. mod . phys . a * 12 * ( 1997 ) 479 ; for a recent review of the nmssm , see , for example , u. ellwanger , c. hugonie and a. m. teixeira , arxiv:0910.1785 . see , e.g. , j. r. ellis , j. f. gunion , h. e. haber , l. roszkowski and f. zwirner , phys . rev . d * 39 * ( 1989 ) 844 ; m. drees , int . j. mod . phys . a * 4 * ( 1989 ) 3635 ; u. ellwanger , m. rausch de traubenberg and c. a. savoy , phys . lett . b * 315 * ( 1993 ) 331 ; nucl . phys . b * 492 * ( 1997 ) 21 ; d. j. miller , r. nevzorov and p. m. zerwas , 681 , 3 ( 2004 ) . c. panagiotakopoulos , k. tamvakis , 446 , 224 ( 1999 ) ; 469 , 145 ( 1999 ) ; c. panagiotakopoulos , a. pilaftsis , 63 , 055003 ( 2001 ) ; a. dedes , _ et al . _ , 63 , 055009 ( 2001 ) ; a. menon , _ et al .
_ , 70 , 035005 ( 2004 ) ; v. barger , _ et al . _ , 630 , 85 ( 2005 ) . c. balazs , _ et al . _ , 0706 , 066 ( 2007 ) . b. a. dobrescu , k. t. matchev , 0009 , 031 ( 2000 ) ; a. arhrib , k. cheung , t. j. hou , k. w. song , hep - ph/0611211 ; 0703 , 073 ( 2007 ) ; x. g. he , j. tandean , and g. valencia , 98 , 081802 ( 2007 ) ; 0806 , 002 ( 2008 ) ; f. domingo _ et al_. , 0901 , 061 ( 2009 ) ; gudrun hiller , 70 , 034018 ( 2004 ) ; r. dermisek , and john f. gunion , 75 , 075019 ( 2007 ) ; 79 , 055014 ( 2009 ) ; 81 , 055001 ( 2010 ) ; r. dermisek , john f. gunion , and b. mcelrath , 76 , 051105 ( 2007 ) ; z. heng , _ et al_. , 77 , 095012 ( 2008 ) ; a. belyaev _ et al_. , 81 , 075021 ( 2010 ) ; d. das and u. ellwanger , arxiv:1007.1151 [ hep - ph ] . s. andreas , o. lebedev , s. ramos - sanchez and a. ringwald , arxiv:1005.3978 [ hep - ph ] . j. f. gunion , jhep * 0908 * , 032 ( 2009 ) ; r. dermisek and j. f. gunion , phys . rev . d * 81 * , 075003 ( 2010 ) . r. dermisek and j. f. gunion , phys . lett . * 95 * , 041801 ( 2005 ) ; phys . d * 73 * , 111701 ( 2006 ) . j. cao , h. e. logan , j. m. yang , 79 , 091701 ( 2009 ) . j. cao , p. wan , l. wu , j. m. yang , 80 , 071701 ( 2009 ) . j. f. gunion and h. e. haber , 67 , 075019 ( 2003 ) . r. m. barnett , _ et al . _ , phys . b * 136 * , 191 ( 1984 ) ; r. m. barnett , g. senjanovic and d. wyler , phys . d * 30 * , 1529 ( 1984 ) ; y. grossman , nucl . b * 426 * , 355 ( 1994 ) . h. s. goh , l. j. hall and p. kumar , jhep * 0905 * , 097 ( 2009 ) ; a. g. akeroyd and w. j. stirling , nucl . b * 447 * , 3 ( 1995 ) ; a. g. akeroyd , phys . b * 377 * , 95 ( 1996 ) ; h. e. logan and d. maclennan , phys . rev . d * 79 * , 115022 ( 2009 ) ; m. aoki , _ et al . _ , arxiv:0902.4665 [ hep - ph ] . v. barger , p. langacker , h. s. lee and g. shaughnessy , phys . d * 73 * , 115010 ( 2006 ) . s. hesselbach , _ et . _ , arxiv:0810.0511v2 [ hep - ph ] . de vivie and p. janot [ aleph collaboration ] , pa13 - 027 contribution to the international conference on high energy physics , warsaw , poland , 2531 july 1996 ; j. kurowska , o. grajek and p. zalewski [ delphi collaboration ] , cern - open-99 - 385 . [ aleph collaboration and delphi collaboration and l3 collaboration ] , phys . rept . * 427 * , 257 ( 2006 ) . j. cao and j. m. yang , jhep * 0812 * , 006 ( 2008 ) . m. krawczyk and d. temes , eur . j. c * 44 * , 435 ( 2005 ) . g. altarelli and r. barbieri , 253 , 161 ( 1991 ) ; m. e. peskin , t. takeuchi , 46 , 381 ( 1992 ) . c. amsler , _ et al . _ , ( particle data group ) , 667 , 1 ( 2008 ) . o. deschamps , s. descotes - genon , s. monteil , v. niess , s. tjampens and v. tisserand , arxiv:0907.5135 [ hep - ph ] . s. su and b. thomas , phys . d * 79 * , 095014 ( 2009 ) . g. abbiendi , _ et al . _ , eur . phys . j. c * 32 * , 453 ( 2004 ) . m. davier , _ et al . _ , 66 , 1 ( 2010 ) . k. cheung , _ et al . _ , phys . d * 64 * , 111301 ( 2001 ) . k. cheung and o. c. w. kong , phys . d * 68 * , 053003 ( 2003 ) . t. besmer , c. greub , t.hurth , 609 , 359 ( 2001 ) ; f. borzumati , _ et al . _ , 62 , 075005(2000 ) . j. cao , k. i. hikasa , w. wang , j. m. yang and l. x. yu , phys . d * 82 * , 051701 ( 2010 ) [ arxiv:1006.4811 [ hep - ph ] ] . j. f. gunion , _ et . d * 73 * , 015011 ( 2006 ) . martin and j. d. wells , phys . d * 64 * , 035003 ( 2001 ) . j. abdallah _ et al . _ , eur . j. c * 31 * , 421 ( 2004 ) ; g. abbiendi _ et al . _ , eur . j. c * 35 * , 1 ( 2004 ) . j. dunkley _ et al . _ [ wmap collaboration ] , astrophys . j. suppl . 
* 180 * , 306 ( 2009 ) [ arxiv:0803.0586 [ astro - ph ] ] . u. ellwanger _ et al . _ , 02 , 066 ( 2005 ) . g. belanger , f. boudjema , a. pukhov and a. semenov , comput . phys . commun . * 174 * , 577 ( 2006 ) ; comput . phys . commun . * 176 * , 367 ( 2007 ) . g. belanger , f. boudjema , c. hugonie , a. pukhov and a. semenov , jcap * 0509 * , 001 ( 2005 ).""" + + ARTICLE_MAGNET = """it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been attracting much attention ever since its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that a metal shows a magnetoresistance linear in the perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , two other models , which do not require an open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . abrikosov suggested a quantum - limit origin of lmr for a homogeneous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and that the carrier concentration is so small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving a linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to mobility fluctuations in a strongly inhomogeneous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in the bulk but gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in the opposite situation , where the carrier sheet density is so high that electrons occupy more than one landau level.@xcite moreover , they found that raising the temperature up to room temperature has almost no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far , a reliable theoretical scheme capable of explaining this novel experiment is still lacking . in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within a quite wide magnetic - field range in a ti surface state having a positive and finite effective g - factor .
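for orientation , the classical rule quoted at the start of this section is easy to reproduce with the textbook two - band drude model , whose magnetoresistance grows quadratically at low field and saturates once the field is strong . the following python sketch is our own illustration , with arbitrary , hypothetical carrier parameters ( it is not part of the original analysis ) :

e = 1.602e-19                        # elementary charge, C
n1, n2 = 1.0e21, 8.0e20              # hypothetical carrier densities, m^-3
mu1, mu2 = 0.30, 0.05                # hypothetical mobilities, m^2/Vs
s1, s2 = n1 * e * mu1, n2 * e * mu2  # zero-field conductivities of each band

def delta_rho_over_rho0(B):
    # standard two-band magnetoresistance: quadratic for mu*B << 1,
    # saturating for mu*B >> 1
    num = s1 * s2 * (mu1 - mu2) ** 2 * B**2
    den = (s1 + s2) ** 2 + (s1 * mu2 + s2 * mu1) ** 2 * B**2
    return num / den

for B in (0.1, 1.0, 10.0, 100.0):
    print(f"B = {B:6.1f} T  ->  dRho/Rho0 = {delta_rho_over_rho0(B):.4f}")

the printed values rise as the square of the field and then level off , which is exactly the behavior that the observed lmr violates .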
this linear magnetoresistance shows up in systems of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and it persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy is located in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . further , the separations of the fermi energy from the bottom of the bulk band and from the dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a uniform in - plane electric field @xmath14 in the topological surface . the hamiltonian of this many - electron and phonon system consists of an electron part @xmath15 , a phonon part @xmath16 , and electron - impurity and electron - phonon interactions @xmath17 and @xmath18 : @xmath19 here , the electron hamiltonian is taken in the form @xmath20 , \ ] ] in which @xmath21 , @xmath22 , @xmath23 and @xmath24 stand , respectively , for the canonical momentum , coordinate , momentum and spin operators of the @xmath25th electron having charge @xmath26 , @xmath27 is the vector potential of the perpendicular magnetic field @xmath28 in the landau gauge , @xmath29 is the fermi velocity , @xmath30 is the effective g - factor of the surface electron , and @xmath31 is the bohr magneton with @xmath32 the free electron mass . the sum index @xmath25 in eq.([helectron ] ) goes over all electrons of total number @xmath33 in the surface state of unit area . in the framework of the balance - equation approach,@xcite the two - dimensional center - of - mass ( c.m . ) momentum and coordinate @xmath34 and @xmath35 , and the relative - electron momenta and coordinates @xmath36 and @xmath37 are introduced to write the hamiltonian @xmath15 as the sum of a single - particle c.m . part @xmath38 and a many - particle relative - electron part @xmath39 : @xmath40 , with @xmath41.\end{aligned}\ ] ] here , @xmath42 is the canonical momentum of the center - of - mass and @xmath43 is the canonical momentum of the @xmath25th relative electron . we have also introduced the c.m . spin operators @xmath44 and @xmath45 . the commutation relations between the c.m . spin operators @xmath46 and @xmath47 and the spin operators @xmath48 , @xmath49 and @xmath50 of the @xmath25th electron are of order @xmath51 : @xmath52= n^{-1}2\,{\rm i}\,\varepsilon_{\beta_1\beta_2\beta_3}\sigma_j^{\beta_3}$ ] with @xmath53 . therefore , for a macroscopically large @xmath33 system , the c.m . part @xmath38 actually commutes with the relative - electron part @xmath54 in the hamiltonian , i.e. the c.m . motion and the relative motion of the electrons are truly separated from each other . the couplings between the two emerge only through the electron - impurity and electron - phonon interactions . furthermore , the electric field @xmath55 shows up only in @xmath38 .
and , in view of @xmath56={\rm i}\delta_{\alpha \beta}(\delta_{ij}-1/n)\simeq { \rm i}\delta_{\alpha\beta}\delta_{ij}$ ] , i.e. the relative - electron momenta and coordinates can be treated as canonical conjugate variables , the relative - motion part @xmath54 is just the hamiltonian of @xmath33 electrons in the surface state of ti in the magnetic field without the presence of the electric field . in terms of the c.m . coordinate @xmath57 and the relative electron density operator @xmath58 , the electron impurity and electron phonon interactions can be written as@xcite @xmath59 here @xmath60 and @xmath61 are respectively the impurity potential ( an impurity at randomly distributed position @xmath62 ) and electron phonon coupling matrix element in the plane - wave representation , and @xmath63 with @xmath64 and @xmath65 being the creation and annihilation operators for a phonon of wavevector @xmath66 in branch @xmath67 having frequency @xmath68 . velocity ( operator ) @xmath69 is the time variation of its coordinate : @xmath70= v_{\rm f}(\sigma_{\rm c}^y\ , \hat{i}-\sigma_{\rm c}^x\ , \hat{j})$ ] . to derive a force - balance equation for steady state transport we consider the heisenberg equation for the rate of change of the c.m . canonical momentum @xmath71 : @xmath72= - n e({\bm v}\times { \bm b})- n e{\bm e}+{\bm { f}}_{\rm i}+{\bm { f}}_{\rm p},\ ] ] in which the frictional forces @xmath73 and @xmath74 share the same expressions as given in ref .. the statistical average of the operator equation can be determined to linear order in the electron impurity and electron phonon interactions @xmath17 and @xmath18 with the initial density matrix @xmath75 at temperature @xmath76 when the in - plane electric field @xmath77 is not strong . for steady - transport states we have @xmath78 , leading to a force - balance equation of the form @xmath79 here @xmath80 , the statistically averaged velocity of the moving center - of - mass , is identified as the average rate of change of its position , i.e. the drift velocity of the electron system driven by the electric field @xmath77 , and @xmath81 and @xmath82 are frictional forces experienced by the center - of - mass due to impurity and phonon scatterings : @xmath83,\label{fp}\end{aligned}\ ] ] in which @xmath84 is the bose distribution function , @xmath85 , and @xmath86 stands for the imaginary part of the fourier spectrum of the relative - electron density correlation function defined by @xmath87\big\rangle_{0},\ ] ] where @xmath88 and @xmath89 denotes the statistical averaging over the initial density matrix @xmath90.@xcite the force - balance equation describes the steady - state two - dimensional magnetotransport in the surface state of a ti . note that the frictional forces @xmath81 and @xmath82 are in the opposite direction of the drift velocity @xmath91 and their magnitudes are functions of @xmath92 only . with the drift velocity @xmath93 in the @xmath9 direction , the force - balance equation eq . yields a transverse resistivity @xmath94 , and a longitudinal resistivity @xmath95 . 
the linear one is in the form @xmath96 . for calculating the electron density correlation function @xmath97 , we proceed in the landau representation.@xcite the landau levels of the single - particle hamiltonian @xmath98 of the relative - electron system in the absence of the electric field are composed of a positive `` @xmath99 '' and a negative `` @xmath100 '' branch@xcite @xmath101 with @xmath102 and @xmath103 , and a zero ( @xmath104 ) level @xmath105 . the corresponding landau wave functions are @xmath106 and @xmath107 for @xmath108 , and @xmath109 for @xmath104 . here @xmath110 is the wavevector of the system along the @xmath9 direction ; @xmath111 with @xmath112 ; and @xmath113 is the harmonic oscillator eigenfunction , with @xmath114 being the hermite polynomial , @xmath115 , and @xmath116 . each landau level contains @xmath117 electron states for a system of unit surface area . the positive branch @xmath118 and the @xmath104 level @xmath119 of the above energy spectra are indeed quite close to those of the surface states in the bulk gap of bi@xmath0se@xmath1-family materials derived from microscopic band calculation.@xcite the landau levels are broadened due to impurity , phonon and electron - electron scatterings . we model the imaginary part of the retarded green s function , or the density - of - states , of the broadened landau level @xmath120 ( written for the `` @xmath99 '' -branch and @xmath104 levels ) using a gaussian - type form:@xcite @xmath121,\ ] ] with a half - width @xmath122 of the form:@xcite @xmath123^{1/2}$ ] . here @xmath124 is the single - particle lifetime and @xmath125 is the cyclotron frequency of the linear - energy - dispersion system , with @xmath126 being the zero - temperature fermi level . using a semi - empirical parameter @xmath127 to relate @xmath124 with the transport scattering time @xmath128 , and expressing @xmath129 with the zero - field mobility @xmath5 at finite temperature,@xcite we can write the landau - level broadening as @xmath130^{1/2}.\ ] ] in the present study we consider the case of @xmath120-doping , i.e. the fermi level lies high enough above the energy zero of the dirac cone , in the range of the `` @xmath99 '' -branch levels , and the states of the `` @xmath100 '' -branch levels are completely filled , so that they are irrelevant to electron transport . special attention has to be paid to the @xmath104 level , since , depending on the direction of the exchange potential , the effective g - factor of a ti surface state , @xmath30 , can be positive , zero or negative.@xcite the sign and magnitude of the effective g - factor determine how many states of the zero level should be included in or excluded from the available states for electron occupation in the case of @xmath120-doping at a magnetic field . ( i ) if @xmath131 , the @xmath104 level center is exactly at @xmath132 and the system is electron - hole symmetric . the total number of negative energy states ( including the states of the lower half of the @xmath104 level and the states of the `` @xmath100 '' -branch levels ) and that of positive energy states ( including the states of the upper half of the @xmath104 level and the states of the `` @xmath99 '' -branch levels ) do not change when the magnetic field is changed . therefore , the lower - half negative energy states of this level are always filled , and its upper - half positive - energy states are available for the occupation of particles which are counted as electrons participating in transport in the case of @xmath120-doping .
( ii ) for a finite positive @xmath133 , the @xmath104 level @xmath134 moves downward to negative energy , and its distance to the nearest `` @xmath100 '' -branch level is @xmath135 closer than to the nearest `` @xmath99 '' -branch level at finite magnetic field strength @xmath2 . this is equivalent to the opening of an increasingly enlarged ( with increasing @xmath2 ) energy gap between the `` @xmath99 '' -branch states and the states of the zero level and the `` @xmath100 '' -branch levels . the opening of a sufficient energy gap implies that with increasing magnetic field the states in the `` @xmath99 '' -branch levels would no longer shrink into the zero level , and thus the @xmath104 level should be completely excluded from the conduction band , i.e. only particles occupying the `` @xmath99 '' -branch states are counted as electrons participating in transport in the case of @xmath120-doping , when the magnetic field @xmath2 gets larger than a certain value ( depending on the magnitude of @xmath30 ) . ( iii ) for a finite negative @xmath136 , the @xmath104 level @xmath134 moves upward to positive energy , and an increasingly enlarged energy gap opens between the states of the zero level and the `` @xmath99 '' -branch and the states of the `` @xmath100 '' -branch levels ; in this case , particles occupying the @xmath104 level and the `` @xmath99 '' -branch states are the electrons participating in transport when the magnetic field @xmath2 gets larger than a certain value . as a result , the experimentally accessible sheet density @xmath33 of electrons participating in transport is related to the fermi energy @xmath137 by the following equation , valid at finite @xmath30 for the magnetic field @xmath2 larger than a certain value : @xmath138 in which @xmath139 + 1\}^{-1}$ ] is the fermi distribution function at temperature @xmath76 , and the summation index @xmath120 goes over @xmath140 for @xmath133 , or @xmath141 for @xmath136 . in the case of @xmath131 , @xmath142\ ] ] valid for arbitrary magnetic field , in which @xmath143 . the imaginary part of the relative - electron density correlation function in the presence of a magnetic field , @xmath86 , can be expressed in the landau representation as@xcite @xmath144 in which the transform factor @xmath145 ^ 2,\end{aligned}\ ] ] with @xmath146 , @xmath147 , @xmath148 , and @xmath149 being associated laguerre polynomials . the landau - representation correlation function @xmath150 in eq.([piqw ] ) can be constructed with the imaginary part of the retarded green s function @xmath151 , or the density - of - states , of the @xmath120th landau level as@xcite @xmath152\nonumber\\ & \hspace{1.2cm}\times{\rm im}g_n(\epsilon+\omega){\rm im}g_{n'}(\epsilon).\end{aligned}\ ] ] the summation indices @xmath120 and @xmath153 in eq.([piqw ] ) are taken over @xmath140 for @xmath133 , or @xmath154 for @xmath136 . in the case of @xmath131 , eq.([piqw ] ) still works , and the summation indices @xmath120 and @xmath153 go over @xmath154 but with @xmath155 replaced by @xmath156 in eq.([p2nn ] ) . numerical calculations are performed for the magnetoresistivity @xmath157 of the surface state in a uniform ti bi@xmath0se@xmath1 . at zero temperature the elastic scattering contributing to the resistivity is modeled by a coulomb potential due to charged impurities:@xcite @xmath158 with @xmath159 being the impurity density , which is determined by the zero - magnetic - field mobility @xmath5 . at temperatures higher than @xmath160,@xcite phonon scatterings play an increasingly important role and the dominant inelastic contribution comes from optical phonons .
for this polar material , the scattering by optical phonons via the deformation potential can be neglected . hence , we take into account the inelastic scattering from optical phonons via the fröhlich coupling : @xmath161 . in the numerical calculation we use the following parameters:@xcite fermi velocity @xmath162 , static dielectric constant @xmath163 , optical dielectric constant @xmath164 , and phonon energy @xmath165 . the broadening parameter is taken to be @xmath166 . [ fig.[diffg ] caption : the magnetoresistivity as a function of the magnetic field @xmath2 for different effective g - factors @xmath167 and @xmath168 , for a ti surface system with electron sheet density @xmath169 , in the cases of zero - magnetic - field mobility @xmath170 ( a ) and @xmath171 ( b ) ; several integer - number positions of the filling factor @xmath172 are marked in ( b ) . ] fig.[diffg ] shows the calculated magnetoresistivity @xmath157 versus the magnetic field strength @xmath2 for a ti surface system with electron sheet density @xmath169 but having different effective g - factors , @xmath167 and @xmath168 , for two values of the zero - magnetic - field mobility , @xmath170 and @xmath171 , representing different degrees of landau - level broadening . in the case without zeeman splitting ( @xmath131 ) , the resistivity @xmath157 exhibits almost no change with changing magnetic field up to 10 t , except for the shubnikov - de haas ( sdh ) oscillation showing up in the case of @xmath171 . this kind of magnetoresistance behavior was indeed seen experimentally in the electron - hole symmetric massless system of single - layer graphene.@xcite in the case of a positive g - factor , @xmath173 , the magnetoresistivity increases linearly with increasing magnetic field , while for a negative g - factor , @xmath174 , the magnetoresistivity decreases linearly with increasing magnetic field . [ fig.[rhob ] caption : the resistivity is shown as a function of the magnetic field @xmath2 for different values of the zero - magnetic - field mobility : ( a ) @xmath175 , ( b ) @xmath176 , ( c ) @xmath177 , ( d ) @xmath178 , ( e ) @xmath179 , and ( f ) @xmath180 . the inset of ( a ) illustrates the same for a larger magnetic - field range @xmath181 . the filling factor @xmath182 is plotted versus the magnetic field in ( f ) , and several integer - number positions of @xmath182 are also marked in ( d ) and ( e ) . here the surface electron density is @xmath169 and the lattice temperature is @xmath183 . ] in the following we examine in more detail the linearly increasing magnetoresistance in the positive @xmath30 case . fig.[rhob ] shows the calculated resistivity @xmath157 versus the magnetic field strength @xmath2 at lattice temperature @xmath183 for a system of carrier sheet density @xmath169 and @xmath173 , having different zero - field mobilities @xmath184 and @xmath180 . all the resistivity curves for mobility @xmath185 exhibit clear linearity in the magnetic - field range and show no tendency of saturation at the highest field shown in the figure . in particular , for the case @xmath170 , the linear behavior extends even up to the magnetic field of @xmath186 , as illustrated in the inset of fig.[rhob](a ) . this feature contradicts the classical mr , which saturates at sufficiently large magnetic field @xmath187 .
note that here we only present the calculated @xmath157 for magnetic fields @xmath2 larger than @xmath188 t , for which a sufficient energy gap @xmath135 is assumed to open , so that with further increase of the magnetic field the states in the `` + '' -branch levels no longer shrink into the zero level , and the latter should thus be excluded from the conduction band . this is of course not true for a very weak magnetic field . when @xmath189 the energy gap @xmath190 , and the situation becomes similar to the case of @xmath131 : the whole upper half of the zero - level states is available to electron occupation , and we should have a flat resistivity @xmath157 when changing the magnetic field . with increasing @xmath2 , the portion of the zero - level states available to conduction electrons decreases until the magnetic field reaches @xmath191 . as a result , the resistivity @xmath157 should exhibit a crossover from a flat dependence at small @xmath2 to a positive linear increase at @xmath192 . this is just the behavior observed in the ti bi@xmath0se@xmath1.@xcite note that in the case of @xmath170 , the broadened landau - level widths are always larger than the neighboring level interval : @xmath193 , which requires @xmath194 ^ 2 $ ] , even for the lowest landau level @xmath195 , i.e. the whole landau - level spectrum is smeared . with increasing zero - field mobility the magnitude of the resistivity @xmath157 decreases , and when the broadened landau - level width becomes smaller than the neighboring level interval , @xmath196 , a weak sdh oscillation begins to appear around the linearly - dependent average value of @xmath157 in the higher portion of the magnetic - field range , as seen in fig.[rhob](c ) , ( d ) and ( e ) for @xmath197 and @xmath198 . on the other hand , in the case of large mobility , e.g. @xmath199 , where the broadened landau - level widths @xmath200 are much smaller than the neighboring level interval even for a level index @xmath120 as large as @xmath201 , the magnetoresistivity shows a pronounced sdh oscillation and the linear behavior disappears before the appearance of the quantum hall effect,@xcite as shown in fig.[rhob](f ) . abrikosov s model for the lmr requires the applied magnetic field to be large enough to reach the quantum limit at which all the carriers are within the lowest landau level,@xcite while it is obvious that more than one landau level is occupied in the experimental samples in the field range in which the linear and non - saturating magnetoresistivity was observed.@xcite for the given electron surface density @xmath202 , the number of occupied landau levels , or the filling factor @xmath172 , at different magnetic fields is shown in fig.[rhob](f ) , as well as in fig.[rhob](d ) and ( e ) , where the integer - number positions of @xmath203 , i.e. the filling of entire @xmath182 landau levels , coincide with the minima of the density of states , or the dips of the sdh oscillation . this is in contrast with the @xmath131 case , where an integer value of @xmath203 , which implies filling up to the center of the @xmath182th landau level , is located at a peak of the sdh oscillation , as shown in fig.[diffg]b . the observed sdh oscillations in the bi@xmath0se@xmath1 nanoribbon exhibiting nonsaturating surface lmr in the experiment@xcite favor the former case : a finite positive effective @xmath133 .
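the role of the filling factor in this argument is easy to quantify . the python sketch below is our own illustration , not from the paper : it assumes the standard zeeman - split dirac landau spectrum , a degeneracy of eb/h states per level , and hypothetical parameter values , and it ignores the level broadening in the fermi sums :

import numpy as np

hbar, e, h = 1.0546e-34, 1.602e-19, 6.626e-34
mu_b, k_b = 9.274e-24, 1.381e-23
v_f, g_eff = 5.0e5, 20.0            # hypothetical surface-state parameters

def sheet_density(e_f, B, T, n_max=200):
    """n_s = (eB/h) * sum_{n>=1} f(e_n); for g_eff > 0 the zero level is
    excluded from the conduction states, per the counting rules above."""
    z = 0.5 * g_eff * mu_b * B                              # Zeeman shift
    n = np.arange(1, n_max + 1)
    e_plus = np.sqrt(2.0 * n * hbar * e * v_f**2 * B + z**2)  # '+'-branch
    f = 1.0 / (np.exp((e_plus - e_f) / (k_b * T)) + 1.0)      # Fermi factors
    return (e * B / h) * np.sum(f)

def filling_factor(n_s, B):
    # nu = n_s / (eB/h): number of (nondegenerate) Landau levels occupied
    return n_s * h / (e * B)

n_s = 1.0e16                        # placeholder sheet density, m^-2
print(f"example density: {sheet_density(0.1 * e, 5.0, 10.0):.3e} m^-2")
for nu in range(5, 11):             # fields where an integer number of levels
    B = n_s * h / (e * nu)          # is exactly filled: SdH dips for g > 0
    print(f"nu = {nu:2d}  ->  B = {B:4.2f} T")

with such numbers , many levels are filled at a few tesla , illustrating why the experimental field range sits far from abrikosov s one - level quantum limit .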
[ fig.[rhon ] caption : the resistivity is plotted as a function of the surface electron density @xmath33 at magnetic field @xmath204 : ( a ) at different values of the zero - field mobility @xmath5 , and ( b ) at different values of the zero - field conductivity @xmath205 . ] [ fig.[rhot ] caption : the magnetoresistivity at various lattice temperatures ; here the zero - magnetic - field mobility at zero temperature is @xmath206 . ] next , we examine the density dependence of the linear magnetoresistivity . to compare with abrikosov s quantum magnetoresistance , which suggests a @xmath207 behavior,@xcite we show the calculated @xmath208 for the above lmr versus the carrier sheet density @xmath33 in fig.[rhon ] at fixed magnetic field @xmath209 t . the mobility is taken to be @xmath210 and @xmath211 m@xmath212/vs respectively , to keep the resistivity in the lmr regime . a clearly linear dependence of @xmath213 on the surface density @xmath33 is seen in all cases , indicating that this non - saturating linear resistivity is almost inversely proportional to the carrier density . in the figure we also show @xmath208 versus @xmath33 for different given conductivities @xmath214 and @xmath215 . in this case the half - width @xmath216 is independent of the surface density . the linear dependence still holds , indicating that this linear behavior is not sensitive to the modest @xmath33-dependence of the landau - level broadening @xmath216 , as long as the system is in the overlapped landau level regime . from the above discussion , it is obvious that the lmr shows up in systems having overlapped landau levels , and the separation of the landau levels makes the mr depart from the linear increase . at high temperature , the thermal energy would smear the level separation , and phonon scatterings further broaden the landau levels . hence , it is expected that this lmr will be robust against increasing temperature . this is indeed the case , as seen in fig.[rhot ] , where we plot the calculated magnetoresistivity @xmath157 for the above system with zero - temperature linear mobility @xmath217 m@xmath212/vs versus the magnetic field at different lattice temperatures . we can see that raising the temperature to room temperature has little effect on the linearity of the mr . due to the mobility decrease caused by phonon scattering at higher temperatures , the weak sdh oscillation on the linear background tends to vanish . these features are in good agreement with the experimental report.@xcite in summary , we have studied the two - dimensional magnetotransport in the flat surface of a three - dimensional ti , which arises from the surface states with a wavevector - linear energy dispersion and a finite , positive zeeman splitting within the bulk energy gap . when the level broadening is comparable to or larger than the landau - level separation and the conduction electrons spread over many landau levels , a positive , dominantly linear and non - saturating magnetoresistance appears within a quite wide range of magnetic field and persists up to room temperature .
this remarkable lmr provides a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite in contrast to the quantum hall effect , which appears in the case of well - formed landau levels , and to abrikosov s quantum magnetotransport,@xcite which is limited to the extreme quantum limit where all electrons coalesce into the lowest landau level , the discussed lmr is a phenomenon of purely classical two - dimensional magnetotransport in a system having a linear energy dispersion , appearing in the regime of overlapped landau levels , even though it shows up in a relatively high magnetic - field range . furthermore , the present scheme deals with the spatially uniform case without invoking the mobility fluctuations of a strongly inhomogeneous system , which are required in the classical parish and littlewood model to produce an lmr.@xcite the appearance of this significant positively increasing linear magnetoresistance depends on the existence of a positive and sizable effective g - factor . if the zeeman energy splitting is quite small , the resistivity @xmath157 would exhibit little change with changing magnetic field . in the case of a negative and sizable effective g - factor , the magnetoresistivity would decrease linearly with increasing magnetic field . therefore , the behavior of the longitudinal resistivity versus magnetic field may provide a useful way for judging the direction and the size of the effective zeeman energy splitting in ti surface states . this work was supported by the national science foundation of china ( grant no . 11104002 ) , the national basic research program of china ( grant no . 2012cb927403 ) and by the program for science & technology innovation talents in universities of henan province ( grant no . 2012hastit029 ).""" + + dct = tok.batch_encode_plus( + [ARTICLE_LEP, ARTICLE_MAGNET], + max_length=6144, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=4, + max_length=512, + early_stopping=True, + no_repeat_ngram_size=3, + ) + + EXPECTED_LEP = " the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the expected sensitivity to the branching ratio of the rare decays, especially its exotic or rare processes, should be investigated comprehensively to evaluate their potential in probing new physics. in this work \n, we extend the previous studies of these decays to some new models and investigate the decays altogether. we are motivated by some recent studies on the singlet extension of the mssm, such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly - minimal - supersymmetry - standard - model(nmssm)@xcite, where a light cp - odd higgs boson with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry. # 1#2#3#4#5#6#7#8#9#10#11#12 " + + EXPECTED_MAGNET = " the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however, reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field even in an opposite situation where the carrier sheet density is high that all electrons occupy more than one landau levels.
\n it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model. " + + generated = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + assert generated == [EXPECTED_LEP, EXPECTED_MAGNET] diff --git a/test_modeling_longformer.py b/test_modeling_longformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d5eee162661871c53a9bde17bcf9d34c62896d --- /dev/null +++ b/test_modeling_longformer.py @@ -0,0 +1,707 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + LongformerConfig, + LongformerForMaskedLM, + LongformerForMultipleChoice, + LongformerForQuestionAnswering, + LongformerForSequenceClassification, + LongformerForTokenClassification, + LongformerModel, + LongformerSelfAttention, + ) + + +class LongformerModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.attention_window = 4 + + # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size + # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention + # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] + # because its local attention only attends to `self.attention_window + 1` locations + # (assuming no token with global attention, otherwise the last dimension of attentions + # is x + self.attention_window + 1, where x is the number of tokens with global attention) + self.key_length = self.attention_window + 1 + + # because of padding `encoder_seq_length`, is different from `seq_length`. 
Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = ( + self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window + ) + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = LongformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + attention_window=self.attention_window, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_attention_mask_determinism( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LongformerModel(config=config) + model.to(torch_device) + model.eval() + + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + output_with_mask = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] + output_without_mask = model(input_ids)["last_hidden_state"] + self.parent.assertTrue(torch.allclose(output_with_mask[0, 0, :5], output_without_mask[0, 0, :5], atol=1e-4)) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LongformerModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_global_attention_mask( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LongformerModel(config=config) + model.to(torch_device) + model.eval() + global_attention_mask = input_mask.clone() + global_attention_mask[:, input_mask.shape[-1] // 2] = 0 + global_attention_mask = global_attention_mask.to(torch_device) + + result = model( + input_ids, + attention_mask=input_mask, + global_attention_mask=global_attention_mask, + token_type_ids=token_type_ids, + ) + result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask) + result = model(input_ids, global_attention_mask=global_attention_mask) + + 
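+ # Added aside (ours, not part of the original test): the padding arithmetic
+ # described in the tester comments above, written out explicitly. Longformer
+ # pads inputs so their length becomes a multiple of `attention_window`, and
+ # local attention then spans `attention_window + 1` key positions per query.
+ _window, _seq = 4, 7                           # the tester's default values
+ _pad = (_window - _seq % _window) % _window    # -> 1 extra padding position
+ assert _seq + _pad == 8 and _window + 1 == 5   # encoder_seq_length, key_length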
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LongformerForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LongformerForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + global_attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = LongformerForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = LongformerForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = LongformerForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + global_attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + global_attention_mask = torch.zeros_like(input_ids) +
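+ # Note (added for clarity, following Longformer's documented convention): in
+ # `global_attention_mask` a value of 1 marks a token that attends to, and is
+ # attended by, all other tokens, while 0 keeps sliding-window local attention.
+ # The all-zeros mask built above therefore exercises the pure local-attention
+ # path in the common tests; a QA-style setup would instead look like:
+ #
+ #     global_attention_mask = torch.zeros_like(input_ids)
+ #     global_attention_mask[:, :num_question_tokens] = 1  # hypothetical length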
inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + "global_attention_mask": global_attention_mask, + } + return config, inputs_dict + + def prepare_config_and_inputs_for_question_answering(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + # Replace sep_token_id by some random id + input_ids[input_ids == config.sep_token_id] = torch.randint(0, config.vocab_size, (1,)).item() + # Make sure there are exactly three sep_token_id + input_ids[:, -3:] = config.sep_token_id + input_mask = torch.ones_like(input_ids) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + +@require_torch +class LongformerModelTest(ModelTesterMixin, unittest.TestCase): + test_pruning = False # pruning is not supported + test_torchscript = False + test_sequence_classification_problem_types = True + + all_model_classes = ( + ( + LongformerModel, + LongformerForMaskedLM, + LongformerForSequenceClassification, + LongformerForQuestionAnswering, + LongformerForTokenClassification, + LongformerForMultipleChoice, + ) + if is_torch_available() + else () + ) + + def setUp(self): + self.model_tester = LongformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_attention_mask_determinism(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_attention_mask_determinism(*config_and_inputs) + + def test_model_global_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_global_attention_mask(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_question_answering() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # longformer cannot keep gradients in attentions or hidden states + return + + +@require_torch +@require_sentencepiece +@require_tokenizers +class LongformerModelIntegrationTest(unittest.TestCase): + def _get_hidden_states(self): + return torch.tensor( + [ + [ + [ + 4.98332758e-01, + 2.69175139e00, + -7.08081422e-03, + 1.04915401e00, + -1.83476661e00, + 7.67220476e-01, + 2.98580543e-01, + 2.84803992e-02, + ], + [ + 
-7.58357372e-01, + 4.20635998e-01, + -4.04739919e-02, + 1.59924145e-01, + 2.05135748e00, + -1.15997978e00, + 5.37166397e-01, + 2.62873606e-01, + ], + [ + -1.69438001e00, + 4.17574660e-01, + -1.49196962e00, + -1.76483717e00, + -1.94566312e-01, + -1.71183858e00, + 7.72903565e-01, + -1.11557056e00, + ], + [ + 5.44028163e-01, + 2.05466114e-01, + -3.63045868e-01, + 2.41865062e-01, + 3.20348382e-01, + -9.05611176e-01, + -1.92690727e-01, + -1.19917547e00, + ], + ] + ], + dtype=torch.float32, + device=torch_device, + ) + + def test_diagonalize(self): + hidden_states = self._get_hidden_states() + hidden_states = hidden_states.reshape((1, 8, 4)) # set seq length = 8, hidden dim = 4 + chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2) + window_overlap_size = chunked_hidden_states.shape[2] + self.assertTrue(window_overlap_size == 4) + + padded_hidden_states = LongformerSelfAttention._pad_and_diagonalize(chunked_hidden_states) + + self.assertTrue(padded_hidden_states.shape[-1] == chunked_hidden_states.shape[-1] + window_overlap_size - 1) + + # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] + self.assertTrue(torch.allclose(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], atol=1e-3)) + self.assertTrue( + torch.allclose( + padded_hidden_states[0, 0, 0, 4:], + torch.zeros((3,), device=torch_device, dtype=torch.float32), + atol=1e-3, + ) + ) + # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629] + self.assertTrue(torch.allclose(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], atol=1e-3)) + self.assertTrue( + torch.allclose( + padded_hidden_states[0, 0, -1, :3], + torch.zeros((3,), device=torch_device, dtype=torch.float32), + atol=1e-3, + ) + ) + + def test_pad_and_transpose_last_two_dims(self): + hidden_states = self._get_hidden_states() + self.assertEqual(hidden_states.shape, (1, 4, 8)) + padding = (0, 0, 0, 1) + + padded_hidden_states = LongformerSelfAttention._pad_and_transpose_last_two_dims(hidden_states, padding) + self.assertEqual(padded_hidden_states.shape, (1, 8, 5)) + + expected_added_dim = torch.zeros((5,), device=torch_device, dtype=torch.float32) + self.assertTrue(torch.allclose(expected_added_dim, padded_hidden_states[0, -1, :], atol=1e-6)) + self.assertTrue(torch.allclose(hidden_states[0, -1, :], padded_hidden_states.view(1, -1)[0, 24:32], atol=1e-6)) + + def test_chunk(self): + hidden_states = self._get_hidden_states() + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = hidden_states.reshape((batch_size, seq_length, hidden_size)) + + chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2) + + # expected slices across chunk and seq length dim + expected_slice_along_seq_length = torch.tensor( + [0.4983, -0.7584, -1.6944], device=torch_device, dtype=torch.float32 + ) + expected_slice_along_chunk = torch.tensor( + [0.4983, -1.8348, -0.7584, 2.0514], device=torch_device, dtype=torch.float32 + ) + + self.assertTrue(torch.allclose(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, atol=1e-3)) + self.assertTrue(torch.allclose(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, atol=1e-3)) + self.assertEqual(chunked_hidden_states.shape, (1, 3, 4, 4)) + + def test_mask_invalid_locations(self): + hidden_states = self._get_hidden_states() + + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = hidden_states.reshape((batch_size, seq_length, hidden_size)) + chunked_hidden_states =
+        chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2)
+
+        hid_states_1 = chunked_hidden_states.clone()
+        LongformerSelfAttention._mask_invalid_locations(hid_states_1, 1)
+        self.assertTrue(torch.isinf(hid_states_1).sum().item() == 8)
+
+        hid_states_2 = chunked_hidden_states.clone()
+        LongformerSelfAttention._mask_invalid_locations(hid_states_2, 2)
+        self.assertTrue(torch.isinf(hid_states_2).sum().item() == 24)
+
+        hid_states_3 = chunked_hidden_states.clone()[:, :, :, :3]
+        LongformerSelfAttention._mask_invalid_locations(hid_states_3, 2)
+        self.assertTrue(torch.isinf(hid_states_3).sum().item() == 24)
+
+        hid_states_4 = chunked_hidden_states.clone()[:, :, 2:, :]
+        LongformerSelfAttention._mask_invalid_locations(hid_states_4, 2)
+        self.assertTrue(torch.isinf(hid_states_4).sum().item() == 12)
+
+    def test_layer_local_attn(self):
+        model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
+        model.eval()
+        layer = model.encoder.layer[0].attention.self.to(torch_device)
+        hidden_states = self._get_hidden_states()
+        batch_size, seq_length, hidden_size = hidden_states.size()
+        attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)
+        attention_mask[:, -2:] = -10000
+
+        is_index_masked = attention_mask < 0
+        is_index_global_attn = attention_mask > 0
+        is_global_attn = is_index_global_attn.flatten().any().item()
+
+        output_hidden_states = layer(
+            hidden_states,
+            attention_mask=attention_mask,
+            is_index_masked=is_index_masked,
+            is_index_global_attn=is_index_global_attn,
+            is_global_attn=is_global_attn,
+        )[0]
+
+        self.assertEqual(output_hidden_states.shape, (1, 4, 8))
+        self.assertTrue(
+            torch.allclose(
+                output_hidden_states[0, 1],
+                torch.tensor(
+                    [0.0019, 0.0122, -0.0171, -0.0256, -0.0300, 0.0173, -0.0115, 0.0048],
+                    dtype=torch.float32,
+                    device=torch_device,
+                ),
+                atol=1e-3,
+            )
+        )
+
+    def test_layer_global_attn(self):
+        model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
+        model.eval()
+        layer = model.encoder.layer[0].attention.self.to(torch_device)
+        hidden_states = torch.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0)
+        batch_size, seq_length, hidden_size = hidden_states.size()
+        attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)
+
+        # create attn mask
+        attention_mask[0, -2:] = 10000.0
+        attention_mask[0, -1:] = -10000.0
+        attention_mask[1, 1:] = 10000.0
+
+        is_index_masked = attention_mask < 0
+        is_index_global_attn = attention_mask > 0
+        is_global_attn = is_index_global_attn.flatten().any().item()
+
+        output_hidden_states = layer(
+            hidden_states,
+            attention_mask=attention_mask,
+            is_index_masked=is_index_masked,
+            is_index_global_attn=is_index_global_attn,
+            is_global_attn=is_global_attn,
+        )[0]
+
+        self.assertEqual(output_hidden_states.shape, (2, 4, 8))
+
+        self.assertTrue(
+            torch.allclose(
+                output_hidden_states[0, 2],
+                torch.tensor(
+                    [-0.0651, -0.0393, 0.0309, -0.0342, -0.0066, -0.0155, -0.0209, -0.0494],
+                    dtype=torch.float32,
+                    device=torch_device,
+                ),
+                atol=1e-3,
+            )
+        )
+
+        self.assertTrue(
+            torch.allclose(
+                output_hidden_states[1, -2],
+                torch.tensor(
+                    [-0.0405, -0.0384, 0.0396, -0.0374, -0.0341, 0.0136, 0.0014, -0.0571],
+                    dtype=torch.float32,
+                    device=torch_device,
+                ),
+                atol=1e-3,
+            )
+        )
+
+    def test_layer_attn_probs(self):
+        model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
+        model.eval()
+        layer = model.encoder.layer[0].attention.self.to(torch_device)
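+        # mask convention shared by these layer tests: 0 keeps a token local-attention
+        # only, +10000 promotes it to global attention, -10000 masks it out entirely. + 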
hidden_states = torch.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0) + batch_size, seq_length, hidden_size = hidden_states.size() + attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device) + + # create attn mask + attention_mask[0, -2:] = 10000.0 + attention_mask[0, -1:] = -10000.0 + attention_mask[1, 1:] = 10000.0 + + is_index_masked = attention_mask < 0 + is_index_global_attn = attention_mask > 0 + is_global_attn = is_index_global_attn.flatten().any().item() + + output_hidden_states, local_attentions, global_attentions = layer( + hidden_states, + attention_mask=attention_mask, + is_index_masked=is_index_masked, + is_index_global_attn=is_index_global_attn, + is_global_attn=is_global_attn, + output_attentions=True, + ) + + self.assertEqual(local_attentions.shape, (2, 4, 2, 8)) + self.assertEqual(global_attentions.shape, (2, 2, 3, 4)) + + # All tokens with global attention have weight 0 in local attentions. + self.assertTrue(torch.all(local_attentions[0, 2:4, :, :] == 0)) + self.assertTrue(torch.all(local_attentions[1, 1:4, :, :] == 0)) + + # The weight of all tokens with local attention must sum to 1. + self.assertTrue(torch.all(torch.abs(global_attentions[0, :, :2, :].sum(dim=-1) - 1) < 1e-6)) + self.assertTrue(torch.all(torch.abs(global_attentions[1, :, :1, :].sum(dim=-1) - 1) < 1e-6)) + + self.assertTrue( + torch.allclose( + local_attentions[0, 0, 0, :], + torch.tensor( + [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000], + dtype=torch.float32, + device=torch_device, + ), + atol=1e-3, + ) + ) + + self.assertTrue( + torch.allclose( + local_attentions[1, 0, 0, :], + torch.tensor( + [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000], + dtype=torch.float32, + device=torch_device, + ), + atol=1e-3, + ) + ) + + # All the global attention weights must sum to 1. + self.assertTrue(torch.all(torch.abs(global_attentions.sum(dim=-1) - 1) < 1e-6)) + + self.assertTrue( + torch.allclose( + global_attentions[0, 0, 1, :], + torch.tensor( + [0.2500, 0.2500, 0.2500, 0.2500], + dtype=torch.float32, + device=torch_device, + ), + atol=1e-3, + ) + ) + + self.assertTrue( + torch.allclose( + global_attentions[1, 0, 0, :], + torch.tensor( + [0.2497, 0.2500, 0.2499, 0.2504], + dtype=torch.float32, + device=torch_device, + ), + atol=1e-3, + ) + ) + + @slow + def test_inference_no_head(self): + model = LongformerModel.from_pretrained("allenai/longformer-base-4096") + model.to(torch_device) + + # 'Hello world!' + input_ids = torch.tensor([[0, 20920, 232, 328, 1437, 2]], dtype=torch.long, device=torch_device) + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + output = model(input_ids, attention_mask=attention_mask)[0] + output_without_mask = model(input_ids)[0] + + expected_output_slice = torch.tensor([0.0549, 0.1087, -0.1119, -0.0368, 0.0250], device=torch_device) + self.assertTrue(torch.allclose(output[0, 0, -5:], expected_output_slice, atol=1e-4)) + self.assertTrue(torch.allclose(output_without_mask[0, 0, -5:], expected_output_slice, atol=1e-4)) + + @slow + def test_inference_no_head_long(self): + model = LongformerModel.from_pretrained("allenai/longformer-base-4096") + model.to(torch_device) + + # 'Hello world! 
' repeated 1000 times + input_ids = torch.tensor( + [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=torch.long, device=torch_device + ) # long input + + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) + global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) + global_attention_mask[:, [1, 4, 21]] = 1 # Set global attention on a few random positions + + output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0] + + expected_output_sum = torch.tensor(74585.8594, device=torch_device) + expected_output_mean = torch.tensor(0.0243, device=torch_device) + self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=1e-4)) + self.assertTrue(torch.allclose(output.mean(), expected_output_mean, atol=1e-4)) + + @slow + def test_inference_masked_lm_long(self): + model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") + model.to(torch_device) + + # 'Hello world! ' repeated 1000 times + input_ids = torch.tensor( + [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=torch.long, device=torch_device + ) # long input + input_ids = input_ids.to(torch_device) + + loss, prediction_scores = model(input_ids, labels=input_ids).to_tuple() + + expected_loss = torch.tensor(0.0074, device=torch_device) + expected_prediction_scores_sum = torch.tensor(-6.1048e08, device=torch_device) + expected_prediction_scores_mean = torch.tensor(-3.0348, device=torch_device) + + self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4)) + self.assertTrue(torch.allclose(prediction_scores.sum(), expected_prediction_scores_sum, atol=1e-4)) + self.assertTrue(torch.allclose(prediction_scores.mean(), expected_prediction_scores_mean, atol=1e-4)) diff --git a/test_modeling_luke.py b/test_modeling_luke.py new file mode 100644 index 0000000000000000000000000000000000000000..1343da5ce2b804f3169ca99919fe42c945ba84d1 --- /dev/null +++ b/test_modeling_luke.py @@ -0,0 +1,609 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch LUKE model. 
""" + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + LukeConfig, + LukeForEntityClassification, + LukeForEntityPairClassification, + LukeForEntitySpanClassification, + LukeModel, + LukeTokenizer, + ) + from transformers.models.luke.modeling_luke import LUKE_PRETRAINED_MODEL_ARCHIVE_LIST + + +class LukeModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + entity_length=3, + mention_length=5, + use_attention_mask=True, + use_token_type_ids=True, + use_entity_ids=True, + use_entity_attention_mask=True, + use_entity_token_type_ids=True, + use_entity_position_ids=True, + use_labels=True, + vocab_size=99, + entity_vocab_size=10, + entity_emb_size=6, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_entity_classification_labels=9, + num_entity_pair_classification_labels=6, + num_entity_span_classification_labels=4, + use_entity_aware_attention=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.entity_length = entity_length + self.mention_length = mention_length + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_entity_ids = use_entity_ids + self.use_entity_attention_mask = use_entity_attention_mask + self.use_entity_token_type_ids = use_entity_token_type_ids + self.use_entity_position_ids = use_entity_position_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.entity_vocab_size = entity_vocab_size + self.entity_emb_size = entity_emb_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_entity_classification_labels = num_entity_classification_labels + self.num_entity_pair_classification_labels = num_entity_pair_classification_labels + self.num_entity_span_classification_labels = num_entity_span_classification_labels + self.scope = scope + self.use_entity_aware_attention = use_entity_aware_attention + + self.encoder_seq_length = seq_length + self.key_length = seq_length + self.num_hidden_states_types = 2 # hidden_states and entity_hidden_states + + def prepare_config_and_inputs(self): + # prepare words + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + # prepare 
entities + entity_ids = ids_tensor([self.batch_size, self.entity_length], self.entity_vocab_size) + + entity_attention_mask = None + if self.use_entity_attention_mask: + entity_attention_mask = random_attention_mask([self.batch_size, self.entity_length]) + + entity_token_type_ids = None + if self.use_token_type_ids: + entity_token_type_ids = ids_tensor([self.batch_size, self.entity_length], self.type_vocab_size) + + entity_position_ids = None + if self.use_entity_position_ids: + entity_position_ids = ids_tensor( + [self.batch_size, self.entity_length, self.mention_length], self.mention_length + ) + + sequence_labels = None + entity_classification_labels = None + entity_pair_classification_labels = None + entity_span_classification_labels = None + + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + entity_classification_labels = ids_tensor([self.batch_size], self.num_entity_classification_labels) + entity_pair_classification_labels = ids_tensor( + [self.batch_size], self.num_entity_pair_classification_labels + ) + entity_span_classification_labels = ids_tensor( + [self.batch_size, self.entity_length], self.num_entity_span_classification_labels + ) + + config = LukeConfig( + vocab_size=self.vocab_size, + entity_vocab_size=self.entity_vocab_size, + entity_emb_size=self.entity_emb_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + use_entity_aware_attention=self.use_entity_aware_attention, + ) + + return ( + config, + input_ids, + attention_mask, + token_type_ids, + entity_ids, + entity_attention_mask, + entity_token_type_ids, + entity_position_ids, + sequence_labels, + entity_classification_labels, + entity_pair_classification_labels, + entity_span_classification_labels, + ) + + def create_and_check_model( + self, + config, + input_ids, + attention_mask, + token_type_ids, + entity_ids, + entity_attention_mask, + entity_token_type_ids, + entity_position_ids, + sequence_labels, + entity_classification_labels, + entity_pair_classification_labels, + entity_span_classification_labels, + ): + model = LukeModel(config=config) + model.to(torch_device) + model.eval() + # test with words + entities + result = model( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + entity_ids=entity_ids, + entity_attention_mask=entity_attention_mask, + entity_token_type_ids=entity_token_type_ids, + entity_position_ids=entity_position_ids, + ) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual( + result.entity_last_hidden_state.shape, (self.batch_size, self.entity_length, self.hidden_size) + ) + + # test with words only + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_entity_classification( + self, + config, + input_ids, + attention_mask, + token_type_ids, + entity_ids, + entity_attention_mask, + entity_token_type_ids, + entity_position_ids, + 
sequence_labels,
+        entity_classification_labels,
+        entity_pair_classification_labels,
+        entity_span_classification_labels,
+    ):
+        config.num_labels = self.num_entity_classification_labels
+        model = LukeForEntityClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            entity_ids=entity_ids,
+            entity_attention_mask=entity_attention_mask,
+            entity_token_type_ids=entity_token_type_ids,
+            entity_position_ids=entity_position_ids,
+            labels=entity_classification_labels,
+        )
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_entity_classification_labels))
+
+    def create_and_check_for_entity_pair_classification(
+        self,
+        config,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        entity_ids,
+        entity_attention_mask,
+        entity_token_type_ids,
+        entity_position_ids,
+        sequence_labels,
+        entity_classification_labels,
+        entity_pair_classification_labels,
+        entity_span_classification_labels,
+    ):
+        config.num_labels = self.num_entity_pair_classification_labels
+        model = LukeForEntityPairClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            entity_ids=entity_ids,
+            entity_attention_mask=entity_attention_mask,
+            entity_token_type_ids=entity_token_type_ids,
+            entity_position_ids=entity_position_ids,
+            labels=entity_pair_classification_labels,
+        )
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_entity_pair_classification_labels))
+
+    def create_and_check_for_entity_span_classification(
+        self,
+        config,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        entity_ids,
+        entity_attention_mask,
+        entity_token_type_ids,
+        entity_position_ids,
+        sequence_labels,
+        entity_classification_labels,
+        entity_pair_classification_labels,
+        entity_span_classification_labels,
+    ):
+        config.num_labels = self.num_entity_span_classification_labels
+        model = LukeForEntitySpanClassification(config)
+        model.to(torch_device)
+        model.eval()
+
+        entity_start_positions = ids_tensor([self.batch_size, self.entity_length], self.seq_length)
+        entity_end_positions = ids_tensor([self.batch_size, self.entity_length], self.seq_length)
+
+        result = model(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            entity_ids=entity_ids,
+            entity_attention_mask=entity_attention_mask,
+            entity_token_type_ids=entity_token_type_ids,
+            entity_position_ids=entity_position_ids,
+            entity_start_positions=entity_start_positions,
+            entity_end_positions=entity_end_positions,
+            labels=entity_span_classification_labels,
+        )
+        self.parent.assertEqual(
+            result.logits.shape, (self.batch_size, self.entity_length, self.num_entity_span_classification_labels)
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            entity_ids,
+            entity_attention_mask,
+            entity_token_type_ids,
+            entity_position_ids,
+            sequence_labels,
+            entity_classification_labels,
+            entity_pair_classification_labels,
+            entity_span_classification_labels,
+        ) = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "token_type_ids": token_type_ids,
+            "attention_mask": attention_mask,
+            "entity_ids": entity_ids,
+            "entity_token_type_ids": entity_token_type_ids,
+            "entity_attention_mask": entity_attention_mask,
+            "entity_position_ids": entity_position_ids,
+        }
+        return config, inputs_dict
+
+
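+# The test class below overrides _prepare_for_class because the entity span
+# classification head needs explicit entity start/end positions and
+# entity-shaped labels in addition to the common inputs.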
+@require_torch +class LukeModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + LukeModel, + LukeForEntityClassification, + LukeForEntityPairClassification, + LukeForEntitySpanClassification, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_torchscript = False + test_resize_embeddings = True + test_head_masking = True + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + if model_class == LukeForEntitySpanClassification: + inputs_dict["entity_start_positions"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.entity_length), dtype=torch.long, device=torch_device + ) + inputs_dict["entity_end_positions"] = torch.ones( + (self.model_tester.batch_size, self.model_tester.entity_length), dtype=torch.long, device=torch_device + ) + + if return_labels: + if model_class in (LukeForEntityClassification, LukeForEntityPairClassification): + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class == LukeForEntitySpanClassification: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.entity_length), + dtype=torch.long, + device=torch_device, + ) + return inputs_dict + + def setUp(self): + self.model_tester = LukeModelTester(self) + self.config_tester = ConfigTester(self, config_class=LukeConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in LUKE_PRETRAINED_MODEL_ARCHIVE_LIST: + model = LukeModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_for_entity_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_entity_classification(*config_and_inputs) + + def test_for_entity_pair_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_entity_pair_classification(*config_and_inputs) + + def test_for_entity_span_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_entity_span_classification(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_length = self.model_tester.seq_length + entity_length = self.model_tester.entity_length + key_length = seq_length + entity_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, 
model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length + entity_length, key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = self.model_tester.num_hidden_states_types + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length + entity_length, key_length], + ) + + def test_entity_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + entity_hidden_states = outputs.entity_hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(entity_hidden_states), expected_num_layers) + + entity_length = self.model_tester.entity_length + + self.assertListEqual( + list(entity_hidden_states[0].shape[-2:]), + [entity_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_retain_grad_entity_hidden_states(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + entity_hidden_states = outputs.entity_hidden_states[0] + entity_hidden_states.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(entity_hidden_states.grad) + + +@require_torch +class LukeModelIntegrationTests(unittest.TestCase): + @slow + def test_inference_base_model(self): + model = LukeModel.from_pretrained("studio-ousia/luke-base").eval() + model.to(torch_device) + + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification") + text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ." 
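+        # characters 39-42 of the text above cover the pronoun "she", the
+        # mention whose entity type is being predicted in this example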
+ span = (39, 42) + encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt") + + # move all values to device + for key, value in encoding.items(): + encoding[key] = encoding[key].to(torch_device) + + outputs = model(**encoding) + + # Verify word hidden states + expected_shape = torch.Size((1, 42, 768)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + # Verify entity hidden states + expected_shape = torch.Size((1, 1, 768)) + self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]]).to(torch_device) + self.assertTrue(torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_large_model(self): + model = LukeModel.from_pretrained("studio-ousia/luke-large").eval() + model.to(torch_device) + + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large", task="entity_classification") + text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ." + span = (39, 42) + encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt") + + # move all values to device + for key, value in encoding.items(): + encoding[key] = encoding[key].to(torch_device) + + outputs = model(**encoding) + + # Verify word hidden states + expected_shape = torch.Size((1, 42, 1024)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]] + ).to(torch_device) + self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) + + # Verify entity hidden states + expected_shape = torch.Size((1, 1, 1024)) + self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]]).to(torch_device) + self.assertTrue(torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_lxmert.py b/test_modeling_lxmert.py new file mode 100644 index 0000000000000000000000000000000000000000..451db8089a5ada5b88a2c26617b621ebca956184 --- /dev/null +++ b/test_modeling_lxmert.py @@ -0,0 +1,729 @@ +# coding=utf-8 +# Copyright 2018 LXMERT Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import copy +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + LxmertConfig, + LxmertForPreTraining, + LxmertForQuestionAnswering, + LxmertModel, + ) + from transformers.models.lxmert.modeling_lxmert import LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class LxmertModelTester: + """You can also import this e.g from .test_modeling_bart import BartModelTester""" + + def __init__( + self, + parent, + vocab_size=300, + hidden_size=28, + num_attention_heads=2, + num_labels=2, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + num_qa_labels=30, + num_object_labels=16, + num_attr_labels=4, + num_visual_features=10, + l_layers=2, + x_layers=1, + r_layers=1, + visual_feat_dim=128, + visual_pos_dim=4, + visual_loss_normalizer=6.67, + seq_length=20, + batch_size=4, + is_training=True, + task_matched=True, + task_mask_lm=True, + task_obj_predict=True, + task_qa=True, + visual_obj_loss=True, + visual_attr_loss=True, + visual_feat_loss=True, + use_token_type_ids=True, + use_lang_mask=True, + output_attentions=False, + output_hidden_states=False, + scope=None, + ): + self.parent = parent + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_labels = num_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.pad_token_id = pad_token_id + self.num_qa_labels = num_qa_labels + self.num_object_labels = num_object_labels + self.num_attr_labels = num_attr_labels + self.l_layers = l_layers + self.x_layers = x_layers + self.r_layers = r_layers + self.visual_feat_dim = visual_feat_dim + self.visual_pos_dim = visual_pos_dim + self.visual_loss_normalizer = visual_loss_normalizer + self.seq_length = seq_length + self.batch_size = batch_size + self.is_training = is_training + self.use_lang_mask = use_lang_mask + self.task_matched = task_matched + self.task_mask_lm = task_mask_lm + self.task_obj_predict = task_obj_predict + self.task_qa = task_qa + self.visual_obj_loss = visual_obj_loss + self.visual_attr_loss = visual_attr_loss + self.visual_feat_loss = visual_feat_loss + self.num_visual_features = num_visual_features + self.use_token_type_ids = use_token_type_ids + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.scope = scope + self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} + + def prepare_config_and_inputs(self): + + output_attentions = self.output_attentions + input_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=self.vocab_size) + visual_feats = torch.rand(self.batch_size, self.num_visual_features, self.visual_feat_dim, 
device=torch_device) + bounding_boxes = torch.rand(self.batch_size, self.num_visual_features, 4, device=torch_device) + + input_mask = None + if self.use_lang_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + obj_labels = None + if self.task_obj_predict: + obj_labels = {} + if self.visual_attr_loss and self.task_obj_predict: + obj_labels["attr"] = ( + ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels), + ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels), + ) + if self.visual_feat_loss and self.task_obj_predict: + obj_labels["feat"] = ( + ids_tensor( + [self.batch_size, self.num_visual_features, self.visual_feat_dim], self.num_visual_features + ), + ids_tensor([self.batch_size, self.num_visual_features], self.num_visual_features), + ) + if self.visual_obj_loss and self.task_obj_predict: + obj_labels["obj"] = ( + ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels), + ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels), + ) + ans = None + if self.task_qa: + ans = ids_tensor([self.batch_size], self.num_qa_labels) + masked_lm_labels = None + if self.task_mask_lm: + masked_lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + matched_label = None + if self.task_matched: + matched_label = ids_tensor([self.batch_size], self.num_labels) + + config = LxmertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + num_labels=self.num_labels, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + pad_token_id=self.pad_token_id, + num_qa_labels=self.num_qa_labels, + num_object_labels=self.num_object_labels, + num_attr_labels=self.num_attr_labels, + l_layers=self.l_layers, + x_layers=self.x_layers, + r_layers=self.r_layers, + visual_feat_dim=self.visual_feat_dim, + visual_pos_dim=self.visual_pos_dim, + visual_loss_normalizer=self.visual_loss_normalizer, + task_matched=self.task_matched, + task_mask_lm=self.task_mask_lm, + task_obj_predict=self.task_obj_predict, + task_qa=self.task_qa, + visual_obj_loss=self.visual_obj_loss, + visual_attr_loss=self.visual_attr_loss, + visual_feat_loss=self.visual_feat_loss, + output_attentions=self.output_attentions, + output_hidden_states=self.output_hidden_states, + ) + + return ( + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ) + + def create_and_check_lxmert_model( + self, + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + model = LxmertModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + output_attentions=output_attentions, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + 
token_type_ids=token_type_ids, + attention_mask=input_mask, + output_attentions=not output_attentions, + ) + result = model(input_ids, visual_feats, bounding_boxes, return_dict=False) + result = model(input_ids, visual_feats, bounding_boxes, return_dict=True) + + self.parent.assertEqual(result.language_output.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual( + result.vision_output.shape, (self.batch_size, self.num_visual_features, self.hidden_size) + ) + self.parent.assertEqual(result.pooled_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_lxmert_for_question_answering( + self, + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + model = LxmertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + labels=ans, + output_attentions=output_attentions, + ) + result = model(input_ids, visual_feats, bounding_boxes, labels=ans) + result = model( + input_ids, + visual_feats, + bounding_boxes, + labels=ans, + token_type_ids=token_type_ids, + attention_mask=input_mask, + output_attentions=output_attentions, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + labels=ans, + output_attentions=not output_attentions, + ) + + self.parent.assertEqual(result.question_answering_score.shape, (self.batch_size, self.num_qa_labels)) + + def create_and_check_lxmert_for_pretraining( + self, + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + model = LxmertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + obj_labels=obj_labels, + matched_label=matched_label, + ans=ans, + output_attentions=output_attentions, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + output_attentions=not output_attentions, + return_dict=False, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + obj_labels=obj_labels, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + matched_label=matched_label, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ans=ans, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + obj_labels=obj_labels, + matched_label=matched_label, + ans=ans, + output_attentions=not output_attentions, + ) + + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def resize_lxmert_num_qa_labels( + self, + config, + 
input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + + start_labels = config.num_qa_labels + num_large_labels = config.num_qa_labels * 2 + num_small_labels = int(config.num_qa_labels * 2) + less_labels_ans = ids_tensor([self.batch_size], num_small_labels) + more_labels_ans = ids_tensor([self.batch_size], num_large_labels) + model_pretrain = LxmertForPreTraining(config=config).to(torch_device) + model_qa = LxmertForQuestionAnswering(config=config).to(torch_device) + config.num_labels = num_small_labels + end_labels = config.num_labels + + result_pretrain = model_pretrain( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ans=ans, + ) + + result_qa = model_qa( + input_ids, + visual_feats, + bounding_boxes, + labels=ans, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ) + + model_pretrain.resize_num_qa_labels(num_small_labels) + model_qa.resize_num_qa_labels(num_small_labels) + + result_pretrain_less = model_pretrain( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ans=less_labels_ans, + ) + + result_qa_less = model_qa( + input_ids, + visual_feats, + bounding_boxes, + labels=less_labels_ans, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ) + + model_pretrain.resize_num_qa_labels(num_large_labels) + model_qa.resize_num_qa_labels(num_large_labels) + + result_pretrain_more = model_pretrain( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ans=more_labels_ans, + ) + + result_qa_more = model_qa( + input_ids, + visual_feats, + bounding_boxes, + labels=more_labels_ans, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ) + + model_qa_labels = model_qa.num_qa_labels + + self.parent.assertNotEqual(start_labels, end_labels) + self.parent.assertNotEqual(model_qa_labels, start_labels) + self.parent.assertEqual(result_qa.question_answering_score.shape, (self.batch_size, start_labels)) + self.parent.assertEqual(result_pretrain.question_answering_score.shape, (self.batch_size, start_labels)) + self.parent.assertEqual(result_qa_less.question_answering_score.shape, (self.batch_size, num_small_labels)) + self.parent.assertEqual( + result_pretrain_less.question_answering_score.shape, (self.batch_size, num_small_labels) + ) + self.parent.assertEqual(result_qa_more.question_answering_score.shape, (self.batch_size, num_large_labels)) + self.parent.assertEqual( + result_pretrain_more.question_answering_score.shape, (self.batch_size, num_large_labels) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "visual_feats": visual_feats, + "visual_pos": bounding_boxes, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + + return config, inputs_dict + + +@require_torch +class LxmertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = (LxmertModel, LxmertForPreTraining, LxmertForQuestionAnswering) if is_torch_available() else () + + test_head_masking = False + test_pruning = False + test_torchscript = False + + # overwrite function because qa models 
takes different input label shape + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + + if return_labels: + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + # special case for models like BERT that use multi-loss training for PreTraining + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = LxmertModelTester(self) + self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_lxmert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lxmert_model(*config_and_inputs) + + def test_lxmert_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lxmert_for_question_answering(*config_and_inputs) + + def test_lxmert_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lxmert_for_pretraining(*config_and_inputs) + + def test_lxmert_question_answering_labels_resize(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.resize_lxmert_num_qa_labels(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = LxmertModel.from_pretrained(model_name) + model.to(torch_device) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + seq_len = getattr(self.model_tester, "seq_length", None) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1]) + + self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1]) + 
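# LXMERT returns one attention tuple per encoder stream, so the last three
+            # outputs hold the language, vision, and cross-modality attentions. + 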
self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) + + attentions = [language_attentions, vision_attentions, cross_encoder_attentions] + attention_shapes = [ + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + self.model_tester.num_visual_features, + self.model_tester.num_visual_features, + ], + [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features], + ] + + for attention, attention_shape in zip(attentions, attention_shapes): + self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # 2 hidden states were added + self.assertEqual(out_len + 2, len(outputs)) + + language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1]) + self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) + + attentions = [language_attentions, vision_attentions, cross_encoder_attentions] + attention_shapes = [ + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + self.model_tester.num_visual_features, + self.model_tester.num_visual_features, + ], + [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features], + ] + + for attention, attention_shape in zip(attentions, attention_shapes): + self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1] + + self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1) + self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1) + + seq_length = self.model_tester.seq_length + num_visual_features = self.model_tester.num_visual_features + + self.assertListEqual( + list(language_hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + self.assertListEqual( + list(vision_hidden_states[0].shape[-2:]), + [num_visual_features, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + 
check_hidden_states_output(inputs_dict, config, model_class)
+
+    def test_retain_grad_hidden_states_attentions(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        # no need to test all models as different heads yield the same functionality
+        model_class = self.all_model_classes[0]
+        model = model_class(config)
+        model.to(torch_device)
+
+        inputs = self._prepare_for_class(inputs_dict, model_class)
+
+        outputs = model(**inputs)
+
+        hidden_states_lang = outputs.language_hidden_states[0]
+        attentions_lang = outputs.language_attentions[0]
+
+        hidden_states_vision = outputs.vision_hidden_states[0]
+        attentions_vision = outputs.vision_attentions[0]
+
+        hidden_states_lang.retain_grad()
+        attentions_lang.retain_grad()
+        hidden_states_vision.retain_grad()
+        attentions_vision.retain_grad()
+
+        outputs.language_output.flatten()[0].backward(retain_graph=True)
+        outputs.vision_output.flatten()[0].backward(retain_graph=True)
+
+        self.assertIsNotNone(hidden_states_lang.grad)
+        self.assertIsNotNone(attentions_lang.grad)
+        self.assertIsNotNone(hidden_states_vision.grad)
+        self.assertIsNotNone(attentions_vision.grad)
diff --git a/test_modeling_m2m_100.py b/test_modeling_m2m_100.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39876e4ee7cecd996470b3c2ecb2e7130ce7b19
--- /dev/null
+++ b/test_modeling_m2m_100.py
@@ -0,0 +1,377 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch M2M100 model. """
""" + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Model, M2M100Tokenizer + from transformers.models.m2m_100.modeling_m2m_100 import M2M100Decoder, M2M100Encoder + + +def prepare_m2m_100_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class M2M100ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids[:, -1] = self.eos_token_id # Eos Token + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + # we need to clamp the input ids here to avoid having pad token in between + # this is because for M2M100 the position_ids are prepared such that + # all pad tokens have pos id = 2 and rest are between 2..seq_length + # and the seq_length here is seq_length - num_pad_tokens + # but 
+@require_torch
+class M2M100ModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="relu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids[:, -1] = self.eos_token_id  # Eos Token
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        # We clamp the input ids here to avoid pad tokens in the middle of the
+        # sequence: for M2M100 the position_ids are prepared such that all pad
+        # tokens get position id 2 and the rest lie in 2..seq_length, where
+        # seq_length excludes the pad tokens. When using past, there is no way
+        # of knowing whether the past input ids contained pad tokens, which
+        # yields an incorrect seq_length and, in turn, position_ids that are
+        # off by num_pad_tokens for the past input.
+        input_ids = input_ids.clamp(self.pad_token_id + 1)
+        decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1)
+
+        config = M2M100Config(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            encoder_layerdrop=self.encoder_layerdrop,
+            decoder_layerdrop=self.decoder_layerdrop,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+        )
+        inputs_dict = prepare_m2m_100_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = M2M100Model(config=config).get_decoder().to(torch_device).eval()
+        input_ids = inputs_dict["input_ids"]
+        attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+
+        # create three hypothetical next tokens and extend next_input_ids with them
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+
+        # append the new tokens to input_ids and attention_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
+            "last_hidden_state"
+        ]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
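+
+    # The check above exercises the KV-cache contract: decoding the full
+    # sequence without past must match decoding only the new tokens with
+    # past_key_values. A minimal sketch of the same property, assuming a
+    # hypothetical standalone decoder `model` and ids (attention masks omitted):
+    #
+    #     out_full = model(ids).last_hidden_state[:, -3:]
+    #     past = model(ids[:, :-3], use_cache=True).past_key_values
+    #     out_incr = model(ids[:, -3:], past_key_values=past).last_hidden_state
+    #     torch.allclose(out_full, out_incr, atol=1e-2)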
< 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = M2M100Decoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class M2M100ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + M2M100Model, + M2M100ForConditionalGeneration, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (M2M100ForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = M2M100ModelTester(self) + self.config_tester = ConfigTester(self, config_class=M2M100Config) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (M2M100Model, M2M100ForConditionalGeneration): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = M2M100ForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class 
M2M100ModelIntegrationTests(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") + + def test_inference_no_head(self): + model = M2M100Model.from_pretrained("facebook/m2m100_418M").to(torch_device) + input_ids = _long_tensor([[128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38, 2]]) + decoder_input_ids = _long_tensor([[2, 128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38]]) + inputs_dict = prepare_m2m_100_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[-0.7780, -0.1676, 0.1038], [-6.7556, -1.3992, 0.0567], [-7.5383, -0.5920, -0.2779]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(torch_device) + + # change to intended input + input_ids = _long_tensor([[128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38, 2]]) + decoder_input_ids = _long_tensor([[2, 128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38]]) + inputs_dict = prepare_m2m_100_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[-1.0448, -1.0411, 3.7992], [-3.2191, -3.2386, -1.3451], [-3.6210, -3.5993, 0.4925]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(torch_device) + tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="fr", tgt_lang="en") + + src_fr = [ + "L'affaire NSA souligne l'absence totale de débat sur le renseignement", + "Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.", + "Lorsque François Hollande téléphone à Barack Obama ou quand le ministre des affaires étrangères Laurent Fabius convoque l'ambassadeur des Etats-Unis, ils réagissent à une vraie découverte, qui est celle de l'ampleur de la surveillance américaine sur l'ensemble des communications en France.", + ] + + # The below article tests that we don't add any hypotheses outside of the top n_beams + dct = tokenizer(src_fr, padding=True, return_tensors="pt") + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=5, + forced_bos_token_id=tokenizer.get_lang_id("en"), + ) + + expected_en = [ + "The NSA case highlights the total absence of intelligence debate", + "I think there are two levels of response from the French government.", + "When François Hollande calls Barack Obama or when Foreign Minister Laurent Fabius calls the U.S. Ambassador, they respond to a real discovery, which is that of the scale of U.S. 
surveillance on all communications in France.",
+        ]
+
+        generated = tokenizer.batch_decode(
+            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
+        )
+        assert generated == expected_en
diff --git a/test_modeling_marian.py b/test_modeling_marian.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b6cb153065b3fa002e8a7e3f8a0785ca691cf6f
--- /dev/null
+++ b/test_modeling_marian.py
@@ -0,0 +1,771 @@
+# coding=utf-8
+# Copyright 2021, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Marian model. """
+
+import tempfile
+import unittest
+
+from transformers import is_torch_available
+from transformers.file_utils import cached_property
+from transformers.hf_api import HfApi
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+
+from .test_configuration_common import ConfigTester
+from .test_generation_utils import GenerationTesterMixin
+from .test_modeling_common import ModelTesterMixin, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        AutoConfig,
+        AutoModelWithLMHead,
+        AutoTokenizer,
+        MarianConfig,
+        MarianModel,
+        MarianMTModel,
+        TranslationPipeline,
+    )
+    from transformers.models.marian.convert_marian_to_pytorch import (
+        ORG_NAME,
+        convert_hf_name_to_opus_name,
+        convert_opus_name_to_hf_name,
+    )
+    from transformers.models.marian.modeling_marian import (
+        MarianDecoder,
+        MarianEncoder,
+        MarianForCausalLM,
+        shift_tokens_right,
+    )
+
+
+def prepare_marian_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = input_ids.ne(config.pad_token_id)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
+    }
+
+
+@require_torch
+class MarianModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+        decoder_start_token_id=3,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.decoder_start_token_id = decoder_start_token_id
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
+            3,
+        )
+        input_ids[:, -1] = self.eos_token_id  # Eos Token
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = MarianConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id,
+        )
+        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = MarianModel(config=config).get_decoder().to(torch_device).eval()
+        input_ids = inputs_dict["input_ids"]
+        attention_mask = inputs_dict["attention_mask"]
+        head_mask = inputs_dict["head_mask"]
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+
+        # create three hypothetical next tokens and extend next_input_ids with them
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+
+        # append the new tokens to input_ids and attention_mask
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
+            "last_hidden_state"
+        ]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
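+        # (comparing one random hidden-dimension slice keeps the check cheap while
+        # still catching cache or position bookkeeping errors; atol leaves room for
+        # accumulated floating point noise between the two paths)
+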
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = MarianModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = MarianEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = MarianDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (MarianModel, MarianMTModel) if is_torch_available() else () + all_generative_model_classes = (MarianMTModel,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = MarianModelTester(self) + self.config_tester = ConfigTester(self, config_class=MarianConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = MarianMTModel(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), 
atol)).float().mean().item()
+        if a.numel() > 100:
+            msg = f"tensor values are {pct_different:.1%} different."
+        else:
+            msg = f"{a} != {b}"
+        if prefix:
+            msg = prefix + ": " + msg
+        raise AssertionError(msg)
+
+
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)
+
+
+class ModelManagementTests(unittest.TestCase):
+    @slow
+    @require_torch
+    def test_model_names(self):
+        model_list = HfApi().model_list()
+        model_ids = [x.modelId for x in model_list if x.modelId.startswith(ORG_NAME)]
+        bad_model_ids = [mid for mid in model_ids if "+" in mid]
+        self.assertListEqual([], bad_model_ids)
+        self.assertGreater(len(model_ids), 500)
+
+
+@require_torch
+@require_sentencepiece
+@require_tokenizers
+class MarianIntegrationTest(unittest.TestCase):
+    src = "en"
+    tgt = "de"
+    src_text = [
+        "I am a small frog.",
+        "Now I can forget the 100 words of german that I know.",
+        "Tom asked his teacher for advice.",
+        "That's how I would do it.",
+        "Tom really admired Mary's courage.",
+        "Turn around and close your eyes.",
+    ]
+    expected_text = [
+        "Ich bin ein kleiner Frosch.",
+        "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.",
+        "Tom bat seinen Lehrer um Rat.",
+        "So würde ich das machen.",
+        "Tom bewunderte Marias Mut wirklich.",
+        "Drehen Sie sich um und schließen Sie die Augen.",
+    ]
+    # ^^ actual C++ output differs slightly: (1) des Deutschen removed, (2) "" -> "O", (3) tun -> machen
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}"
+        return cls
+
+    @cached_property
+    def tokenizer(self):
+        return AutoTokenizer.from_pretrained(self.model_name)
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_token_id
+
+    @cached_property
+    def model(self):
+        model: MarianMTModel = AutoModelWithLMHead.from_pretrained(self.model_name).to(torch_device)
+        c = model.config
+        self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]])
+        self.assertEqual(c.max_length, 512)
+        self.assertEqual(c.decoder_start_token_id, c.pad_token_id)
+
+        if torch_device == "cuda":
+            return model.half()
+        else:
+            return model
+
+    def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs):
+        generated_words = self.translate_src_text(**tokenizer_kwargs)
+        self.assertListEqual(self.expected_text, generated_words)
+
+    def translate_src_text(self, **tokenizer_kwargs):
+        model_inputs = self.tokenizer(self.src_text, padding=True, return_tensors="pt", **tokenizer_kwargs).to(
+            torch_device
+        )
+        self.assertEqual(self.model.device, model_inputs.input_ids.device)
+        generated_ids = self.model.generate(
+            model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128
+        )
+        generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        return generated_words
+
+
+@require_sentencepiece
+@require_tokenizers
+class TestMarian_EN_DE_More(MarianIntegrationTest):
+    @slow
+    def test_forward(self):
+        src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
+        expected_ids = [38, 121, 14, 697, 38848, 0]
+
+        model_inputs = self.tokenizer(src, return_tensors="pt").to(torch_device)
+        with self.tokenizer.as_target_tokenizer():
+            targets = self.tokenizer(tgt, return_tensors="pt")
+        model_inputs["labels"] = targets["input_ids"].to(torch_device)
+
+        self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())
+
+        desired_keys = {
+            "input_ids",
+            "attention_mask",
+            "labels",
+        }
+        self.assertSetEqual(desired_keys,
set(model_inputs.keys())) + model_inputs["decoder_input_ids"] = shift_tokens_right( + model_inputs.labels, self.tokenizer.pad_token_id, self.model.config.decoder_start_token_id + ) + model_inputs["return_dict"] = True + model_inputs["use_cache"] = False + with torch.no_grad(): + outputs = self.model(**model_inputs) + max_indices = outputs.logits.argmax(-1) + self.tokenizer.batch_decode(max_indices) + + def test_unk_support(self): + t = self.tokenizer + ids = t(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist() + expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id] + self.assertEqual(expected, ids) + + def test_pad_not_split(self): + input_ids_w_pad = self.tokenizer(["I am a small frog "], return_tensors="pt").input_ids[0].tolist() + expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad + self.assertListEqual(expected_w_pad, input_ids_w_pad) + + @slow + def test_batch_generation_en_de(self): + self._assert_generated_batch_equal_expected() + + def test_auto_config(self): + config = AutoConfig.from_pretrained(self.model_name) + self.assertIsInstance(config, MarianConfig) + + +@require_sentencepiece +@require_tokenizers +class TestMarian_EN_FR(MarianIntegrationTest): + src = "en" + tgt = "fr" + src_text = [ + "I am a small frog.", + "Now I can forget the 100 words of german that I know.", + ] + expected_text = [ + "Je suis une petite grenouille.", + "Maintenant, je peux oublier les 100 mots d'allemand que je connais.", + ] + + @slow + def test_batch_generation_en_fr(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_FR_EN(MarianIntegrationTest): + src = "fr" + tgt = "en" + src_text = [ + "Donnez moi le micro.", + "Tom et Mary étaient assis à une table.", # Accents + ] + expected_text = [ + "Give me the microphone.", + "Tom and Mary were sitting at a table.", + ] + + @slow + def test_batch_generation_fr_en(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_RU_FR(MarianIntegrationTest): + src = "ru" + tgt = "fr" + src_text = ["Он показал мне рукопись своей новой пьесы."] + expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."] + + @slow + def test_batch_generation_ru_fr(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_MT_EN(MarianIntegrationTest): + """Cover low resource/high perplexity setting. 
This breaks without adjust_logits_generation overwritten""" + + src = "mt" + tgt = "en" + src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] + expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."] + + @slow + def test_batch_generation_mt_en(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_en_zh(MarianIntegrationTest): + src = "en" + tgt = "zh" + src_text = ["My name is Wolfgang and I live in Berlin"] + expected_text = ["我叫沃尔夫冈 我住在柏林"] + + @slow + def test_batch_generation_eng_zho(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_en_ROMANCE(MarianIntegrationTest): + """Multilingual on target side.""" + + src = "en" + tgt = "ROMANCE" + src_text = [ + ">>fr<< Don't spend so much time watching TV.", + ">>pt<< Your message has been sent.", + ">>es<< He's two years older than me.", + ] + expected_text = [ + "Ne passez pas autant de temps à regarder la télé.", + "A sua mensagem foi enviada.", + "Es dos años más viejo que yo.", + ] + + @slow + def test_batch_generation_en_ROMANCE_multi(self): + self._assert_generated_batch_equal_expected() + + @slow + def test_pipeline(self): + device = 0 if torch_device == "cuda" else -1 + pipeline = TranslationPipeline(self.model, self.tokenizer, framework="pt", device=device) + output = pipeline(self.src_text) + self.assertEqual(self.expected_text, [x["translation_text"] for x in output]) + + +@require_torch +class TestConversionUtils(unittest.TestCase): + def test_renaming_multilingual(self): + old_names = [ + "opus-mt-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", + "opus-mt-cmn+cn-fi", # no group + "opus-mt-en-de", # standard name + "opus-mt-en-de", # standard name + ] + expected = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] + self.assertListEqual(expected, [convert_opus_name_to_hf_name(x) for x in old_names]) + + def test_undoing_renaming(self): + hf_names = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] + converted_opus_names = [convert_hf_name_to_opus_name(x) for x in hf_names] + expected_opus_names = [ + "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", + "cmn+cn-fi", + "en-de", # standard name + "en-de", + ] + self.assertListEqual(expected_opus_names, converted_opus_names) + + +class MarianStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads 
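+        # num_attention_heads is aliased to decoder_attention_heads below so this
+        # decoder-only tester exposes the attribute names the common test mixin expects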
+ self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = MarianConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = MarianDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = MarianDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), 
config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (MarianDecoder, MarianForCausalLM) if is_torch_available() else () + all_generative_model_classes = (MarianForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = MarianStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=MarianConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_mbart.py b/test_modeling_mbart.py new file mode 100644 index 0000000000000000000000000000000000000000..40fc6fbcd8ad5a24ef4e452c93e569158f682a26 --- /dev/null +++ b/test_modeling_mbart.py @@ -0,0 +1,656 @@ +# coding=utf-8 +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch MBART model. 
""" + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + AutoTokenizer, + BatchEncoding, + MBartConfig, + MBartForCausalLM, + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, + ) + from transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder + + +def prepare_mbart_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class MBartModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = MBartConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + 
decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = MBartModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = MBartModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = MBartEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = MBartDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + 
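+    # runs the common model and generation test suites over every MBart head;
+    # as with the other BART-family suites, pruning and missing-key checks are
+    # disabled via the flags below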
all_model_classes = ( + (MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (MBartForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = MBartModelTester(self) + self.config_tester = ConfigTester(self, config_class=MBartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # MBartForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (MBartModel, MBartForConditionalGeneration, MBartForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = MBartForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." 
+ else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class AbstractSeq2SeqIntegrationTest(unittest.TestCase): + maxDiff = 1000 # longer string compare tracebacks + checkpoint_name = None + + @classmethod + def setUpClass(cls): + cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) + return cls + + @cached_property + def model(self): + """Only load the model if needed.""" + model = MBartForConditionalGeneration.from_pretrained(self.checkpoint_name).to(torch_device) + if "cuda" in torch_device: + model = model.half() + return model + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "facebook/mbart-large-en-ro" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţa şi mizeria a milioane de oameni.', + ] + expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004] + + @slow + def test_enro_generate_one(self): + batch: BatchEncoding = self.tokenizer( + ["UN Chief Says There Is No Military Solution in Syria"], return_tensors="pt" + ).to(torch_device) + translated_tokens = self.model.generate(**batch) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + self.assertEqual(self.tgt_text[0], decoded[0]) + # self.assertEqual(self.tgt_text[1], decoded[1]) + + @slow + def test_enro_generate_batch(self): + batch: BatchEncoding = self.tokenizer(self.src_text, return_tensors="pt", padding=True, truncation=True).to( + torch_device + ) + translated_tokens = self.model.generate(**batch) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + assert self.tgt_text == decoded + + def test_mbart_enro_config(self): + mbart_models = ["facebook/mbart-large-en-ro"] + expected = {"scale_embedding": True, "output_past": True} + for name in mbart_models: + config = MBartConfig.from_pretrained(name) + for k, v in expected.items(): + try: + self.assertEqual(v, getattr(config, k)) + except AssertionError as e: + e.args += (name, k) + raise + + def test_mbart_fast_forward(self): + config = MBartConfig( + vocab_size=99, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + add_final_layer_norm=True, + ) + lm_model = MBartForConditionalGeneration(config).to(torch_device) + context = torch.tensor( + [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], device=torch_device, dtype=torch.long + ) + summary = torch.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], device=torch_device, dtype=torch.long) + result = lm_model(input_ids=context, 
decoder_input_ids=summary, labels=summary) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(result.logits.shape, expected_shape) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "facebook/mbart-large-cc25" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + " I ate lunch twice yesterday", + ] + tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] + + @unittest.skip("This test is broken, still generates english") + def test_cc25_generate(self): + inputs = self.tokenizer([self.src_text[0]], return_tensors="pt").to(torch_device) + translated_tokens = self.model.generate( + input_ids=inputs["input_ids"].to(torch_device), + decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"], + ) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + self.assertEqual(self.tgt_text[0], decoded[0]) + + @slow + def test_fill_mask(self): + inputs = self.tokenizer(["One of the best I ever read!"], return_tensors="pt").to(torch_device) + outputs = self.model.generate( + inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1 + ) + prediction: str = self.tokenizer.batch_decode( + outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True + )[0] + self.assertEqual(prediction, "of the best books I ever read!") + + +class MBartStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = MBartConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + 
decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = MBartDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = MBartDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + 
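+        # position next_input_ids.shape[-1] - 1 of the full, no-past run lines up
+        # with position 0 of the single-step run that consumed past_key_values
+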
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (MBartDecoder, MBartForCausalLM) if is_torch_available() else () + all_generative_model_classes = (MBartForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = MBartStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=MBartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_megatron_bert.py b/test_modeling_megatron_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..7a58e9f753d1dc0b4c4f754615aaac5492deec0a --- /dev/null +++ b/test_modeling_megatron_bert.py @@ -0,0 +1,378 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch MegatronBERT model. 
""" + + +import math +import os +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + MegatronBertConfig, + MegatronBertForCausalLM, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, + ) + + +class MegatronBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + embedding_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MegatronBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + embedding_size=self.embedding_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + 
initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_megatron_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_megatron_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_causal_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_megatron_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_megatron_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + 
model = MegatronBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_megatron_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MegatronBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_megatron_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = MegatronBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForCausalLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + ) + if is_torch_available() + else () + ) + fx_ready_model_classes = all_model_classes + + # test_resize_embeddings = False + test_head_masking = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = MegatronBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_megatron_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_megatron_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_token_classification(*config_and_inputs) + + +def _long_tensor(tok_lst): + return torch.tensor( + tok_lst, + dtype=torch.long, + device=torch_device, + ) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MegatronBertModelIntegrationTests(unittest.TestCase): + @slow + def test_inference_no_head(self): + directory = "nvidia/megatron-bert-uncased-345m" + if "MYDIR" in os.environ: + directory = os.path.join(os.environ["MYDIR"], directory) + model = MegatronBertModel.from_pretrained(directory) + model.to(torch_device) + model.half() + input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 9, 1024)) + self.assertEqual(output.shape, expected_shape) + + expected = [-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728] + for ii in range(3): + for jj in range(3): + a = output[0, ii, jj] + b = expected[3 * ii + jj] + msg = "ii={} jj={} a={} b={}".format(ii, jj, a, b) + self.assertTrue(math.isclose(a, b, rel_tol=TOLERANCE, abs_tol=TOLERANCE), msg=msg) diff --git a/test_modeling_megatron_gpt2.py b/test_modeling_megatron_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..a1f7c472e391c82e83e2703bdcfd387580828a54 --- /dev/null +++ b/test_modeling_megatron_gpt2.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
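An aside on the slice check in the MegatronBERT integration test above: it walks a 3x3 corner of the hidden states with nested loops and math.isclose. Below is a minimal, self-contained sketch of the same comparison vectorized with torch.allclose; the expected values stand in for the model output here, since running the checkpoint is a @slow test. Note the semantics differ slightly: math.isclose accepts a value when either the relative or the absolute tolerance is met, while torch.allclose requires |a - b| <= atol + rtol * |b|.

import torch

TOLERANCE = 1e-4
expected = torch.tensor([-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728])
# Stand-in for output[0, :3, :3]; in the real test this slice comes from the model.
output_slice = expected.reshape(3, 3).clone()
assert torch.allclose(output_slice.flatten(), expected, rtol=TOLERANCE, atol=TOLERANCE)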
+ +import os +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + + +if is_torch_available(): + import torch + + from transformers import GPT2LMHeadModel + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MegatronGPT2IntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + directory = "nvidia/megatron-gpt2-345m/" + if "MYDIR" in os.environ: + directory = os.path.join(os.environ["MYDIR"], directory) + model = GPT2LMHeadModel.from_pretrained(directory) + model.to(torch_device) + model.half() + + input_ids = torch.tensor( + [[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]], + device=torch_device, + dtype=torch.long, + ) + + with torch.no_grad(): + output = model(input_ids).logits + + expected_shape = torch.Size((1, 9, 50257)) + self.assertEqual(output.shape, expected_shape) + + expected_diag = torch.tensor( + [ + 4.9414, + -0.2920, + -1.2148, + -4.0273, + -0.5161, + -5.2109, + -1.2412, + -1.8301, + -1.7734, + -4.7148, + -0.2317, + -1.0811, + -2.1777, + 0.4141, + -3.7969, + -4.0586, + -2.5332, + -3.3809, + 4.3867, + ], + device=torch_device, + dtype=torch.half, + ) + + for i in range(19): + r, c = 8 * i // 17, 2792 * i # along the diagonal + computed, expected = output[0, r, c], expected_diag[i] + msg = f"row={r} col={c} computed={computed} expected={expected}" + self.assertAlmostEqual(computed, expected, delta=1e-4, msg=msg) diff --git a/test_modeling_mobilebert.py b/test_modeling_mobilebert.py new file mode 100644 index 0000000000000000000000000000000000000000..3ebc770252befb987fac931da7c01a0deb05297c --- /dev/null +++ b/test_modeling_mobilebert.py @@ -0,0 +1,368 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
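A note on the index arithmetic in the Megatron-GPT2 test above: `r, c = 8 * i // 17, 2792 * i` probes a rough diagonal through the (1, 9, 50257) logits tensor. A small sketch verifying that all 19 probe points stay in bounds:

# For i = 0..18, the row index sweeps 0..8 and the column index sweeps 0..50256,
# so every (row, col) pair lands inside the (9, 50257) logits grid.
rows_cols = [(8 * i // 17, 2792 * i) for i in range(19)]
assert rows_cols[0] == (0, 0)
assert rows_cols[-1] == (8, 50256)
assert all(0 <= r < 9 and 0 <= c < 50257 for r, c in rows_cols)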
+ + +import unittest + +from transformers import is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + MobileBertConfig, + MobileBertForMaskedLM, + MobileBertForMultipleChoice, + MobileBertForNextSentencePrediction, + MobileBertForPreTraining, + MobileBertForQuestionAnswering, + MobileBertForSequenceClassification, + MobileBertForTokenClassification, + MobileBertModel, + ) + + +class MobileBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + embedding_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MobileBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + embedding_size=self.embedding_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, 
token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_mobilebert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MobileBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_mobilebert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MobileBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_mobilebert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MobileBertForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_mobilebert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MobileBertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_mobilebert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MobileBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_mobilebert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MobileBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_mobilebert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = 
MobileBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_mobilebert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = MobileBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class MobileBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + MobileBertModel, + MobileBertForMaskedLM, + MobileBertForMultipleChoice, + MobileBertForNextSentencePrediction, + MobileBertForPreTraining, + MobileBertForQuestionAnswering, + MobileBertForSequenceClassification, + MobileBertForTokenClassification, + ) + if is_torch_available() + else () + ) + fx_ready_model_classes = all_model_classes + test_sequence_classification_problem_types = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = MobileBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_mobilebert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_mobilebert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_token_classification(*config_and_inputs) + + +def _long_tensor(tok_lst): + return torch.tensor( + tok_lst, + dtype=torch.long, + device=torch_device, + ) + + +TOLERANCE = 1e-3 + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MobileBertModelIntegrationTests(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = MobileBertModel.from_pretrained("google/mobilebert-uncased").to(torch_device) + input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 9, 512)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [ + [ + [-2.4736526e07, 8.2691656e04, 1.6521838e05], + [-5.7541704e-01, 3.9056022e00, 4.4011507e00], + [2.6047359e00, 1.5677652e00, -1.7324188e-01], + ] + ], + device=torch_device, + ) + + # MobileBERT results range from 10e0 to 10e8. Even a 0.0000001% difference with a value of 10e8 results in a + # ~1 difference, it's therefore not a good idea to measure using addition. + # Here, we instead divide the expected result with the result in order to obtain ~1. We then check that the + # result is held between bounds: 1 - TOLERANCE < expected_result / result < 1 + TOLERANCE + lower_bound = torch.all((expected_slice / output[..., :3, :3]) >= 1 - TOLERANCE) + upper_bound = torch.all((expected_slice / output[..., :3, :3]) <= 1 + TOLERANCE) + + self.assertTrue(lower_bound and upper_bound) diff --git a/test_modeling_mpnet.py b/test_modeling_mpnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1d63824c4512231427816ea0de7263e49207db96 --- /dev/null +++ b/test_modeling_mpnet.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, Microsoft Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
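The ratio-based bound in the MobileBERT integration test above deserves a quick illustration: with activations spanning roughly 1e0 to 1e8, any fixed absolute tolerance that suits the small entries is meaningless for the large ones, whereas a ratio bound treats both scales uniformly. A minimal sketch, using two values from the expected slice above:

import torch

TOLERANCE = 1e-3
expected = torch.tensor([2.6047, -2.4736526e07])
actual = expected * 1.0001  # 0.01% relative error on every entry
# An absolute check fails: the large entry is off by ~2.5e3 in absolute terms.
assert not torch.allclose(actual, expected, rtol=0, atol=1.0)
# The ratio check passes: every entry is within 0.1% of its expected value.
ratio = expected / actual
assert torch.all(ratio >= 1 - TOLERANCE) and torch.all(ratio <= 1 + TOLERANCE)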
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MPNetConfig, + MPNetForMaskedLM, + MPNetForMultipleChoice, + MPNetForQuestionAnswering, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetModel, + ) + + +class MPNetModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=64, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def get_large_model_config(self): + return MPNetConfig.from_pretrained("microsoft/mpnet-base") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MPNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_mpnet_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MPNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + 
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_mpnet_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MPNetForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_mpnet_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MPNetForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_mpnet_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = MPNetForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_mpnet_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MPNetForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class MPNetModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + MPNetForMaskedLM, + MPNetForMultipleChoice, + MPNetForQuestionAnswering, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetModel, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_torchscript = True + test_resize_embeddings = True + + def setUp(self): + self.model_tester = MPNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_mpnet_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs) + + +@require_torch +class MPNetModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = MPNetModel.from_pretrained("microsoft/mpnet-base") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-0.0550, 0.1943, -0.0740], [-0.0562, 0.2211, -0.0579], [-0.0437, 0.3337, -0.0641]]] + ) + # compare the actual values for a slice. + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_mt5.py b/test_modeling_mt5.py new file mode 100644 index 0000000000000000000000000000000000000000..6931f9d80009820b4ec8976f5e3e0697cb5f31f8 --- /dev/null +++ b/test_modeling_mt5.py @@ -0,0 +1,53 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
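One pattern worth calling out from the multiple-choice testers above (MPNet, MobileBERT, and MegatronBERT alike): a (batch, seq) tensor is replicated once per answer candidate to produce the (batch, num_choices, seq) layout the ForMultipleChoice heads expect. A self-contained sketch of that construction:

import torch

batch_size, num_choices, seq_length, vocab_size = 13, 4, 7, 99
input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))
# unsqueeze adds the choice axis, expand repeats it without copying memory,
# and contiguous() materializes the result so downstream reshapes work.
mc_input_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
assert mc_input_ids.shape == (batch_size, num_choices, seq_length)
assert torch.equal(mc_input_ids[:, 0], mc_input_ids[:, 3])  # every choice sees the same tokens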
+ +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + + +if is_torch_available(): + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MT5IntegrationTest(unittest.TestCase): + @slow + def test_small_integration_test(self): + """ + For comparison, run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_mt5_checkpoint = '' + >>> path_to_mtf_small_mt5_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", return_dict=True).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("google/mt5-small") + + input_ids = tokenizer("Hello there", return_tensors="pt").input_ids + labels = tokenizer("Hi I am", return_tensors="pt").input_ids + + loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss + mtf_score = -(labels.shape[-1] * loss.item()) + + EXPECTED_SCORE = -84.9127 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) diff --git a/test_modeling_openai.py b/test_modeling_openai.py new file mode 100644 index 0000000000000000000000000000000000000000..08ee51df3f6b81f9f65200dccfbc252e39db22b8 --- /dev/null +++ b/test_modeling_openai.py @@ -0,0 +1,268 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
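On the score conversion in the MT5 test above: the model's loss is the mean negative log-likelihood per target token, so multiplying it by the number of target tokens and negating recovers the summed log-probability of the whole target sequence, which is the quantity t5's MtfModel.score reports. A sketch of the arithmetic with illustrative numbers (the real token count is labels.shape[-1] and the real mean comes from loss.item()):

num_target_tokens = 7                   # hypothetical stand-in for labels.shape[-1]
mean_nll = 84.9127 / num_target_tokens  # hypothetical stand-in for loss.item()
sequence_log_prob = -(num_target_tokens * mean_nll)
assert abs(sequence_log_prob - (-84.9127)) < 1e-6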
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, + OpenAIGPTConfig, + OpenAIGPTDoubleHeadsModel, + OpenAIGPTForSequenceClassification, + OpenAIGPTLMHeadModel, + OpenAIGPTModel, + ) + + +class OpenAIGPTModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + pad_token_id=self.pad_token_id, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_double_lm_head_model(self, config, 
input_ids, head_mask, token_type_ids, *args): + model = OpenAIGPTDoubleHeadsModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_openai_gpt_for_sequence_classification( + self, config, input_ids, head_mask, token_type_ids, *args + ): + config.num_labels = self.num_labels + model = OpenAIGPTForSequenceClassification(config) + model.to(torch_device) + model.eval() + # print(config.num_labels, sequence_labels.size()) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class OpenAIGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTForSequenceClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = ( + (OpenAIGPTLMHeadModel,) if is_torch_available() else () + ) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "OpenAIGPTDoubleHeadsModel": + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["input_ids"] = inputs_dict["labels"] + inputs_dict["token_type_ids"] = inputs_dict["labels"] + inputs_dict["mc_token_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["mc_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = OpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_openai_gpt_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + def test_openai_gpt_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_openai_gpt_for_sequence_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = OpenAIGPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class OPENAIGPTModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_openai_gpt(self): + model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt") + model.to(torch_device) + input_ids = torch.tensor([[481, 4735, 544]], dtype=torch.long, device=torch_device) # the president is + expected_output_ids = [ + 481, + 4735, + 544, + 246, + 963, + 870, + 762, + 239, + 244, + 40477, + 244, + 249, + 719, + 881, + 487, + 544, + 240, + 244, + 603, + 481, + ] # the president is a very good man. " \n " i\'m sure he is, " said the + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/test_modeling_pegasus.py b/test_modeling_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..4106793332d6829d3be7f72a9c622b153b8be6a6 --- /dev/null +++ b/test_modeling_pegasus.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch PEGASUS model. 
""" + +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor +from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest + + +if is_torch_available(): + import torch + + from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel + from transformers.models.pegasus.modeling_pegasus import PegasusDecoder, PegasusEncoder, PegasusForCausalLM + + +def prepare_pegasus_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class PegasusModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = PegasusConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + 
decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = PegasusModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and next_attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = PegasusModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = PegasusEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = PegasusDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class PegasusModelTest(ModelTesterMixin, GenerationTesterMixin,
unittest.TestCase): + all_model_classes = (PegasusModel, PegasusForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (PegasusForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = PegasusModelTester(self) + self.config_tester = ConfigTester(self, config_class=PegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = PegasusForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "google/pegasus-xsum" + src_text = [ + """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""", + """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. 
And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """, + ] + + tgt_text = [ + "California's largest electricity provider has turned off power to hundreds of thousands of customers.", + "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", + ] + + @cached_property + def model(self): + return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) + + @slow + def test_pegasus_xsum_summary(self): + assert self.tokenizer.model_max_length == 512 + inputs = self.tokenizer(self.src_text, return_tensors="pt", truncation=True, max_length=512, padding=True).to( + torch_device + ) + assert inputs.input_ids.shape == (2, 421) + translated_tokens = self.model.generate(**inputs, num_beams=2) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + assert self.tgt_text == decoded + + if "cuda" not in torch_device: + return + # Demonstrate fp16 issue, Contributions welcome! 
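+ # NOTE (assumption, not verified in this test): max_length=10 keeps the fp16 outputs aligned with the fp32 summary prefixes; longer half-precision generations are where quality is believed to degrade.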
+ self.model.half() + translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) + decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) + assert decoded_fp16 == [ + "California's largest electricity provider has begun", + "N-Dubz have revealed they were", + ] + + +class PegasusStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + d_model=16, + decoder_seq_length=7, + is_training=True, + is_decoder=True, + use_attention_mask=True, + use_cache=False, + use_labels=True, + decoder_start_token_id=2, + decoder_ffn_dim=32, + decoder_layers=4, + encoder_attention_heads=4, + decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.d_model = d_model + self.hidden_size = d_model + self.num_hidden_layers = decoder_layers + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.num_attention_heads = decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = PegasusConfig( + vocab_size=self.vocab_size, + d_model=self.d_model, + decoder_layers=self.decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_attention_heads=self.encoder_attention_heads, + decoder_attention_heads=self.decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = PegasusDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extend to
next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = PegasusDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class PegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (PegasusDecoder, PegasusForCausalLM) if is_torch_available() else () + all_generative_model_classes = (PegasusForCausalLM,) if is_torch_available() else () + test_pruning = False + is_encoder_decoder = False + + def setUp( + self, + ): + self.model_tester = PegasusStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=PegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs
= self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return diff --git a/test_modeling_prophetnet.py b/test_modeling_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..32f1000444688f73ad5a58d3efdfc8ccc20a75a2 --- /dev/null +++ b/test_modeling_prophetnet.py @@ -0,0 +1,1291 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + ProphetNetConfig, + ProphetNetDecoder, + ProphetNetEncoder, + ProphetNetForCausalLM, + ProphetNetForConditionalGeneration, + ProphetNetModel, + ProphetNetTokenizer, + ) + from transformers.modeling_outputs import BaseModelOutput + + +class ProphetNetModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + hidden_size=16, + encoder_seq_length=7, + decoder_seq_length=9, + # For common tests + is_training=True, + use_attention_mask=True, + use_labels=True, + decoder_start_token_id=0, + encoder_ffn_dim=32, + num_encoder_layers=4, + num_encoder_attention_heads=4, + decoder_ffn_dim=32, + num_decoder_layers=4, + num_decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + ngram=2, + num_buckets=32, + relative_max_distance=128, + disable_ngram_loss=False, + scope=None, + ): + + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_decoder_layers + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.num_attention_heads = num_decoder_attention_heads + self.num_encoder_attention_heads = num_encoder_attention_heads + self.num_decoder_attention_heads = num_decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = 
pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.ngram = ngram + self.num_buckets = num_buckets + self.relative_max_distance = relative_max_distance + self.disable_ngram_loss = disable_ngram_loss + self.max_position_embeddings = max_position_embeddings + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 7 + self.num_hidden_states_types = 3 # encoder, decoder_main, decoder_ngram + self.decoder_attention_idx = 2 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + decoder_attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = ProphetNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_encoder_layers=self.num_encoder_layers, + num_decoder_layers=self.num_decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_ffn_dim=self.encoder_ffn_dim, + num_encoder_attention_heads=self.num_encoder_attention_heads, + num_decoder_attention_heads=self.num_decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + ngram=self.ngram, + num_buckets=self.num_buckets, + relative_max_distance=self.relative_max_distance, + disable_ngram_loss=self.disable_ngram_loss, + max_position_embeddings=self.max_position_embeddings, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + return ( + config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) + + def check_prepare_lm_labels_via_shift_left( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetModel(config=config) + model.to(torch_device) + model.eval() + + # make sure that lm_labels are correctly padded from the right + lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id) + + # add causal pad token mask + triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() + lm_labels.masked_fill_(triangular_mask, self.pad_token_id) + decoder_input_ids = model._shift_right(lm_labels) + + for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): + # first item + self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) + if i < decoder_input_ids_slice.shape[-1]: + if i < decoder_input_ids.shape[-1] - 1: + # items before
diagonal + self.parent.assertListEqual( + decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() + ) + # pad items after diagonal + if i < decoder_input_ids.shape[-1] - 2: + self.parent.assertListEqual( + decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() + ) + else: + # all items after square + self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) + + def create_and_check_model( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + decoder_output = result.last_hidden_state + decoder_past = result.past_key_values + encoder_output = result.encoder_last_hidden_state + + self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size)) + self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size)) + # There should be `num_layers` key value embeddings stored in decoder_past + self.parent.assertEqual(len(decoder_past), config.num_decoder_layers) + # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple + self.parent.assertEqual(len(decoder_past[0]), 4) # cross-attention + uni-directional self-attention + + def create_and_check_with_lm_head( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval() + outputs = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + self.parent.assertEqual(len(outputs), 5) + self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size)) + self.parent.assertEqual(outputs["loss"].size(), ()) + + def create_and_check_causal_lm_decoder( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetForCausalLM(config=config).to(torch_device).eval() + outputs = model( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + self.parent.assertEqual(len(outputs), 4) + self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size)) + self.parent.assertEqual(outputs["loss"].size(), ()) + + def create_and_check_generate_with_past_key_value_states( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetForConditionalGeneration(config=config).to(torch_device).eval() + torch.manual_seed(0) + output_without_past_cache = model.generate( + input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False + ) + torch.manual_seed(0) + output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) + self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache)) + + def create_and_check_decoder_generate_with_past_key_value_states( + self, + config, + input_ids, + decoder_input_ids, + 
attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetForCausalLM(config=config).to(torch_device).eval() + torch.manual_seed(0) + output_without_past_cache = model.generate( + input_ids[:1], num_beams=2, max_length=10, do_sample=True, use_cache=False + ) + torch.manual_seed(0) + output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=10, do_sample=True) + self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache)) + + def create_and_check_model_fp16_forward( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = ProphetNetModel(config=config).to(torch_device).half().eval() + output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] + self.parent.assertFalse(torch.isnan(output).any().item()) + + def create_and_check_encoder_decoder_shared_weights( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + for model_class in [ProphetNetModel, ProphetNetForConditionalGeneration]: + torch.manual_seed(0) + model = model_class(config=config).to(torch_device).eval() + # load_state_dict copies the weights but does not tie them + + if model_class == ProphetNetForConditionalGeneration: + model.prophetnet.encoder.load_state_dict(model.prophetnet.decoder.state_dict(), strict=False) + else: + model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) + + torch.manual_seed(0) + tied_config = copy.deepcopy(config) + tied_config.tie_encoder_decoder = True + tied_model = model_class(config=tied_config).to(torch_device).eval() + + model_result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that the tied model has fewer parameters + self.parent.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + tied_model.save_pretrained(tmpdirname) + tied_model = model_class.from_pretrained(tmpdirname) + tied_model.to(torch_device) + tied_model.eval() + + # check that the tied model has fewer parameters + self.parent.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], + tied_model_result[0][0, :, random_slice_idx], + atol=1e-4, + ) + ) + + def check_fast_integration( + self, + config, + *args, + ): + input_ids = torch.tensor([[7, 4, 78, 0, 24, 52, 43]], device=torch_device, dtype=torch.long) + decoder_input_ids = torch.tensor([[12, 62,
25, 11, 47, 15, 14]], device=torch_device, dtype=torch.long) + attention_mask = torch.tensor([[1, 1, 1, 0, 1, 0, 0]], device=torch_device, dtype=torch.long) + decoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 0]], device=torch_device, dtype=torch.long) + lm_labels = torch.tensor([[62, 25, 11, 47, 15, 14, 24]], device=torch_device, dtype=torch.long) + torch.manual_seed(0) + config.ngram = 4 + model = ProphetNetForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(4.5819, device=torch_device), atol=1e-3)) + + expected_logit_slice = torch.tensor( + [-0.1565, 0.0418, 0.1207, 0.0030, 0.0665, 0.0467, 0.0412], device=torch_device + ) + self.parent.assertTrue(torch.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3)) + + def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args): + model = ProphetNetModel(config=config) + model.to(torch_device) + model.eval() + + outputs_no_mask = model(input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5]) + attention_mask = torch.ones_like(input_ids) + decoder_attention_mask = torch.ones_like(decoder_input_ids) + + attention_mask[:, 5:] = 0 + + outputs_with_mask = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # check encoder + self.parent.assertTrue( + torch.allclose( + outputs_no_mask.encoder_last_hidden_state[0, :, 0], + outputs_with_mask.encoder_last_hidden_state[0, :5, 0], + atol=1e-3, + ) + ) + + # check decoder + # main stream + self.parent.assertTrue( + torch.allclose( + outputs_no_mask.last_hidden_state[0, :, 0], outputs_with_mask.last_hidden_state[0, :5, 0], atol=1e-3 + ) + ) + # predict stream + self.parent.assertTrue( + torch.allclose( + outputs_no_mask.last_hidden_state_ngram[0, :5, 0], + outputs_with_mask.last_hidden_state_ngram[0, :5, 0], + atol=1e-2, + ) + ) + + def check_causal_lm_from_pretrained( + self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, *args + ): + model = ProphetNetForConditionalGeneration(config).to(torch_device).eval() + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + decoder = ProphetNetForCausalLM.from_pretrained(tmp_dirname).to(torch_device) + + encoder_hidden_states = model.prophetnet.encoder(input_ids).last_hidden_state + + model_outputs = model( + encoder_outputs=BaseModelOutput(last_hidden_state=encoder_hidden_states), + decoder_input_ids=decoder_input_ids, + ) + dec_outputs = decoder(encoder_hidden_states=encoder_hidden_states, input_ids=decoder_input_ids) + + self.parent.assertTrue( + torch.allclose( + model_outputs.logits[0, :5], + dec_outputs.logits[0, :5], + atol=1e-3, + ) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "use_cache": False, + } + return config, inputs_dict + + +class 
ProphetNetStandaloneDecoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + hidden_size=16, + encoder_seq_length=7, + decoder_seq_length=7, + # For common tests + is_training=True, + is_decoder=True, + use_attention_mask=True, + add_cross_attention=False, + use_cache=False, + use_labels=True, + decoder_start_token_id=0, + encoder_ffn_dim=32, + num_encoder_layers=4, + num_encoder_attention_heads=4, + decoder_ffn_dim=32, + num_decoder_layers=4, + num_decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + ngram=2, + num_buckets=32, + relative_max_distance=128, + disable_ngram_loss=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_decoder_layers + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.num_attention_heads = num_decoder_attention_heads + self.num_encoder_attention_heads = num_encoder_attention_heads + self.num_decoder_attention_heads = num_decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.ngram = ngram + self.num_buckets = num_buckets + self.relative_max_distance = relative_max_distance + self.use_cache = use_cache + self.disable_ngram_loss = disable_ngram_loss + self.max_position_embeddings = max_position_embeddings + self.add_cross_attention = add_cross_attention + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 2 + self.num_hidden_states_types = 2 # decoder_main, decoder_ngram + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + + config = ProphetNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_encoder_layers=self.num_encoder_layers, + num_decoder_layers=self.num_decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_ffn_dim=self.encoder_ffn_dim, + num_encoder_attention_heads=self.num_encoder_attention_heads, + num_decoder_attention_heads=self.num_decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + ngram=self.ngram, + num_buckets=self.num_buckets, + relative_max_distance=self.relative_max_distance, + disable_ngram_loss=self.disable_ngram_loss, + max_position_embeddings=self.max_position_embeddings, + add_cross_attention=self.add_cross_attention, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + 
attention_mask, + lm_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + return ( + config, + input_ids, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + config.use_cache = True + model = ProphetNetDecoder(config=config).to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + attention_mask, + lm_labels, + ): + model = ProphetNetDecoder(config=config).to(torch_device).eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal
for slice + assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +class ProphetNetStandaloneEncoderModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + hidden_size=16, + encoder_seq_length=7, + decoder_seq_length=7, + # For common tests + is_training=True, + is_decoder=False, + use_attention_mask=True, + add_cross_attention=False, + use_cache=False, + use_labels=True, + decoder_start_token_id=0, + encoder_ffn_dim=32, + num_encoder_layers=4, + num_encoder_attention_heads=4, + decoder_ffn_dim=32, + num_decoder_layers=4, + num_decoder_attention_heads=4, + max_position_embeddings=30, + is_encoder_decoder=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + num_buckets=32, + relative_max_distance=128, + disable_ngram_loss=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_decoder_layers + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.num_attention_heads = num_decoder_attention_heads + self.num_encoder_attention_heads = num_encoder_attention_heads + self.num_decoder_attention_heads = num_decoder_attention_heads + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.num_buckets = num_buckets + self.relative_max_distance = relative_max_distance + self.use_cache = use_cache + self.disable_ngram_loss = disable_ngram_loss + self.max_position_embeddings = max_position_embeddings + self.add_cross_attention = add_cross_attention + self.is_encoder_decoder = is_encoder_decoder + + self.scope = None + self.decoder_key_length = decoder_seq_length + self.base_model_out_len = 1 + self.num_hidden_states_types = 1 + self.decoder_attention_idx = 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + config = ProphetNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_encoder_layers=self.num_encoder_layers, + num_decoder_layers=self.num_decoder_layers, + decoder_ffn_dim=self.decoder_ffn_dim, + encoder_ffn_dim=self.encoder_ffn_dim, + num_encoder_attention_heads=self.num_encoder_attention_heads, + num_decoder_attention_heads=self.num_decoder_attention_heads, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + use_cache=self.use_cache, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + num_buckets=self.num_buckets, + relative_max_distance=self.relative_max_distance, + disable_ngram_loss=self.disable_ngram_loss, + 
max_position_embeddings=self.max_position_embeddings, + add_cross_attention=self.add_cross_attention, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (ProphetNetModel, ProphetNetForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (ProphetNetForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = True + + def setUp(self): + self.model_tester = ProphetNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_lm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_with_lm_head(*config_and_inputs) + + def test_only_decoder_causal_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm_decoder(*config_and_inputs) + + def test_fast_integration(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_fast_integration(*config_and_inputs) + + def test_shared_weights(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) + + def test_shift_labels_via_shift_left(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) + + def test_decoder_model_generate(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_generate_with_past_key_value_states(*config_and_inputs) + + def test_encoder_decoder_model_generate(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs) + + def test_attn_mask_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_model_with_attn_mask(*config_and_inputs) + + def test_config_save(self): + config = self.model_tester.prepare_config_and_inputs()[0] + config.add_cross_attention = False + with tempfile.TemporaryDirectory() as tmp_dirname: + config.save_pretrained(tmp_dirname) + config = ProphetNetConfig.from_pretrained(tmp_dirname) + + self.assertFalse(config.add_cross_attention) + + def test_causal_lm_from_pretrained(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_causal_lm_from_pretrained(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_fp16_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() +
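# create_and_check_model_fp16_forward asserts that the half-precision forward pass stays finite (no NaNs) +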
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) + + # methods overwrite method in `test_modeling_common.py` + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + correct_outlen = 7 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + (self.model_tester.ngram + 1) * decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + 
elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + + def test_generate_with_head_masking(self): + """Generating with head_masking has not been implemented for ProphetNet models yet.""" + pass + + +@require_torch +class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (ProphetNetDecoder, ProphetNetForCausalLM) if is_torch_available() else () + all_generative_model_classes = (ProphetNetForCausalLM,) if is_torch_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = False + + def setUp(self): + self.model_tester = ProphetNetStandaloneDecoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_attn_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # decoder cannot keep gradients + return + + +@require_torch +class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ProphetNetEncoder,) if is_torch_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = False + + def setUp(self): + self.model_tester = ProphetNetStandaloneEncoderModelTester(self, is_training=False) + self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + +@require_torch +class ProphetNetModelIntegrationTest(unittest.TestCase): + @slow + def test_pretrained_checkpoint_hidden_states(self): + model 
= ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased") + model.to(torch_device) + + # encoder-decoder outputs + encoder_ids = torch.tensor( + [ + [ + 2871, + 102, + 2048, + 3176, + 2780, + 1997, + 2871, + 26727, + 2169, + 2097, + 12673, + 1996, + 8457, + 2006, + 2049, + 8240, + 2859, + 2799, + 1012, + 2023, + 6512, + 2038, + 2174, + 13977, + 2195, + 25962, + 1012, + 102, + ] + ] + ).to(torch_device) + + decoder_prev_ids = torch.tensor([[102, 2129, 2116, 2372, 2024, 2006, 2169, 1997, 2122, 2048, 2780, 1029]]).to( + torch_device + ) + output = model( + input_ids=encoder_ids, + attention_mask=None, + encoder_outputs=None, + decoder_input_ids=decoder_prev_ids, + ) + output_predicted_logits = output[0] + expected_shape = torch.Size((1, 12, 30522)) + self.assertEqual(output_predicted_logits.shape, expected_shape) + expected_slice = torch.tensor( + [[[-7.6213, -7.9008, -7.9979], [-7.6834, -7.8467, -8.2187], [-7.5326, -7.4762, -8.1914]]] + ).to(torch_device) + assert torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4) + + # encoder outputs + encoder_outputs = model.prophetnet.encoder(encoder_ids)[0] + expected_encoder_outputs_slice = torch.tensor( + [[[-0.2526, -0.1951, -0.2185], [-0.8923, 0.2992, -0.4623], [-0.4585, 0.0165, -0.6652]]] + ).to(torch_device) + expected_shape_encoder = torch.Size((1, 28, 1024)) + self.assertEqual(encoder_outputs.shape, expected_shape_encoder) + assert torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4) + + # decoder outputs + decoder_outputs = model.prophetnet.decoder(decoder_prev_ids, encoder_hidden_states=encoder_outputs) + predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1) + predicting_streams_logits = model.lm_head(predicting_streams) + next_first_stream_logits = predicting_streams_logits[:, 0] + assert torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4) + + @slow + def test_cnndm_inference(self): + model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm") + model.config.max_length = 512 + model.to(torch_device) + + tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm") + + ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university.
USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower() + input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=511, return_tensors="pt").input_ids + + input_ids = input_ids.to(torch_device) + + summary_ids = model.generate( + input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True + ) + EXPECTED_SUMMARIZE_512 = "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the top 16 national key universities ." + generated_titles = [ + " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids + ] + self.assertListEqual( + [EXPECTED_SUMMARIZE_512], + generated_titles, + ) + input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=99, return_tensors="pt").input_ids + input_ids = input_ids.to(torch_device) + # actually 98 tokens are used. max_length=100 contains bos and eos. + summary_ids = model.generate( + input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True + ) + EXPECTED_SUMMARIZE_100 = ( + r"us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc " + "'" + ' s founding mission was to develop a high - level science and technology workforce . [X_SEP] establishment hailed as " a major event in the history of chinese education and science "' + ) + generated_titles = [ + " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids + ] + self.assertListEqual( + [EXPECTED_SUMMARIZE_100], + generated_titles, + ) + + @slow + def test_question_gen_inference(self): + model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg") + model.to(torch_device) + + tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg") + + INPUTS = [ + "Bill Gates [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", + "1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", + "April 4, 1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", + ] + + input_ids = tokenizer(INPUTS, truncation=True, padding=True, return_tensors="pt").input_ids + input_ids = input_ids.to(torch_device) + + gen_output = model.generate(input_ids, num_beams=5, early_stopping=True) + generated_questions = tokenizer.batch_decode(gen_output, skip_special_tokens=True) + + EXPECTED_QUESTIONS = [ + "along with paul allen, who founded microsoft?", + "what year was microsoft founded?", + "on what date was microsoft founded?", + ] + + self.assertListEqual( + EXPECTED_QUESTIONS, + generated_questions, + ) diff --git a/test_modeling_rag.py b/test_modeling_rag.py new file mode 100644 index 0000000000000000000000000000000000000000..15bbea5237313de39858a7f54e476a2c6f283c73 --- /dev/null +++ b/test_modeling_rag.py @@ -0,0 +1,1170 @@ +# coding=utf-8 +# Copyright 2020, The RAG Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import shutil +import tempfile +import unittest +from unittest.mock import patch + +import numpy as np + +from transformers import BartTokenizer, T5Tokenizer +from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_torch_available +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES +from transformers.models.dpr.tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer +from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES +from transformers.testing_utils import ( + require_sentencepiece, + require_tokenizers, + require_torch, + require_torch_non_multi_gpu, + slow, + torch_device, +) + +from .test_modeling_bart import BartModelTester +from .test_modeling_dpr import DPRModelTester +from .test_modeling_t5 import T5ModelTester + + +TOLERANCE = 1e-3 + +T5_SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + +if is_torch_available() and is_datasets_available() and is_faiss_available(): + import torch + from datasets import Dataset + + import faiss + from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSeq2SeqLM, + DPRContextEncoder, + RagConfig, + RagModel, + RagRetriever, + RagSequenceForGeneration, + RagTokenForGeneration, + RagTokenizer, + ) + from transformers.modeling_outputs import BaseModelOutput + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors are not close, or a and b aren't both tensors, raise a nice AssertionError.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def require_retrieval(test_case): + """ + Decorator marking a test that requires the set of dependencies necessary to perform retrieval with + :class:`~transformers.RagRetriever`. + + These tests are skipped when the respective libraries are not installed.
+ + """ + if not (is_torch_available() and is_datasets_available() and is_faiss_available()): + test_case = unittest.skip("test requires PyTorch, datasets and faiss")(test_case) + return test_case + + +@require_torch +@require_retrieval +@require_sentencepiece +class RagTestMixin: + + all_model_classes = ( + (RagModel, RagTokenForGeneration, RagSequenceForGeneration) + if is_torch_available() and is_datasets_available() and is_faiss_available() + else () + ) + + retrieval_vector_size = 32 + n_docs = 3 + max_combined_length = 16 + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + # DPR tok + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") + os.makedirs(dpr_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + # BART tok + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") + os.makedirs(bart_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + t5_tokenizer = T5Tokenizer(T5_SAMPLE_VOCAB) + t5_tokenizer_path = os.path.join(self.tmpdirname, "t5_tokenizer") + t5_tokenizer.save_pretrained(t5_tokenizer_path) + + @cached_property + def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: + return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) + + @cached_property + def dpr_ctx_encoder_tokenizer(self) -> DPRContextEncoderTokenizer: + return DPRContextEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) + + @cached_property + def bart_tokenizer(self) -> BartTokenizer: + return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) + + @cached_property + def t5_tokenizer(self) -> BartTokenizer: + return T5Tokenizer.from_pretrained(os.path.join(self.tmpdirname, "t5_tokenizer")) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_retriever(self, config): + dataset = Dataset.from_dict( + { + "id": ["0", "1", "3"], + "text": ["foo", "bar", "qux"], + "title": ["Foo", "Bar", "Qux"], + "embeddings": [ + np.ones(self.retrieval_vector_size), + 2 * np.ones(self.retrieval_vector_size), + 3 * np.ones(self.retrieval_vector_size), + ], + } + ) + dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) + tokenizer = self.bart_tokenizer if config.generator.model_type == "bart" else self.t5_tokenizer + with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: + 
mock_load_dataset.return_value = dataset + retriever = RagRetriever( + config, + question_encoder_tokenizer=self.dpr_tokenizer, + generator_tokenizer=tokenizer, + ) + return retriever + + def check_model_with_retriever( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + for model_class in self.all_model_classes: + model = model_class(config, retriever=self.get_retriever(config)).to(torch_device) + model.eval() + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def check_model_with_end2end_retriever( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + context_encoder_tokenizer = self.dpr_ctx_encoder_tokenizer + dpr_context_encoder = DPRContextEncoder(config.question_encoder) # dpr is a twin tower + + retriever = self.get_retriever(config) + retriever.set_ctx_encoder_tokenizer(context_encoder_tokenizer) # setting the ctx_encoder_tokenizer. 
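A minimal sketch (toy data, not part of the test above) of the "twin tower" scoring that the mocked retriever relies on: the DPR question and context encoders embed text into the same space (retrieval_vector_size = 32 here), and documents are ranked by inner product, the same metric the faiss index built in get_retriever uses.

    import torch

    question_emb = torch.randn(1, 32)        # stand-in for a question-encoder pooled output
    doc_embs = torch.randn(3, 32)            # stand-ins for context-encoder outputs, one per doc
    doc_scores = question_emb @ doc_embs.T   # inner-product relevance scores, shape (1, 3)
    top_doc = doc_scores.argmax(dim=-1)      # index of the highest-scoring document

The end-to-end check continues below by attaching that context encoder to the model for training.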
+ + for model_class in [RagTokenForGeneration, RagSequenceForGeneration]: + model = model_class(config, retriever=retriever) + model.set_context_encoder_for_training(dpr_context_encoder) # set the context_encoder for training + model.to(torch_device) + model.eval() + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def check_model_generate_from_context_input_ids( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.cpu().detach().to(torch.float32).numpy(), + prefix=config.generator.prefix, + return_tensors="pt", + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + # cast + retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states) + context_input_ids = context_input_ids.to(input_ids) + context_attention_mask = context_attention_mask.to(input_ids) + + # compute doc_scores + doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze( + 1 + ) + + outputs = model.generate( + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + do_deduplication=True, + ) + + self.assertIsNotNone(outputs) + + def check_model_generate( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + for model_class in self.all_model_classes[1:]: + model = model_class(config, retriever=self.get_retriever(config)).to(torch_device) + model.eval() + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model.generate( + input_ids=input_ids, + num_beams=2, + num_return_sequences=2, + decoder_start_token_id=config.generator.eos_token_id, + ) + + self.assertIsNotNone(outputs) + + def check_model_without_retriever( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + 
input_ids, + question_hidden_states.cpu().detach().to(torch.float32).numpy(), + prefix=config.generator.prefix, + return_tensors="pt", + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + # cast + retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states) + context_input_ids = context_input_ids.to(input_ids) + context_attention_mask = context_attention_mask.to(input_ids) + + # compute doc_scores + doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze( + 1 + ) + + outputs = model( + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def check_model_custom_n_docs( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, n_docs, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.cpu().detach().to(torch.float32).numpy(), + prefix=config.generator.prefix, + return_tensors="pt", + n_docs=n_docs, + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + # cast + retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states) + context_input_ids = context_input_ids.to(input_ids) + context_attention_mask = context_attention_mask.to(input_ids) + + # compute doc_scores + doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze( + 1 + ) + + outputs = model( + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + n_docs=n_docs, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], n_docs)) + + def check_model_with_mismatch_n_docs_value( + self, + config, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + retriever_n_docs, + generator_n_docs, + **kwargs + ): + self.assertIsNotNone(config.question_encoder) + 
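For reference, a shape walkthrough of the doc_scores computation repeated in the checks above, as a toy sketch (the sizes are arbitrary, not the tester's): question_hidden_states of shape (batch, dim) is unsqueezed to (batch, 1, dim), batch-multiplied against retrieved_doc_embeds transposed to (batch, dim, n_docs), and squeezed back to (batch, n_docs).

    import torch

    batch, n_docs, dim = 2, 3, 32
    question_hidden_states = torch.randn(batch, dim)
    retrieved_doc_embeds = torch.randn(batch, n_docs, dim)
    doc_scores = torch.bmm(
        question_hidden_states.unsqueeze(1),    # (batch, 1, dim)
        retrieved_doc_embeds.transpose(1, 2),   # (batch, dim, n_docs)
    ).squeeze(1)                                # (batch, n_docs)
    assert doc_scores.shape == (batch, n_docs)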
self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + model.eval() + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.cpu().detach().to(torch.float32).numpy(), + prefix=config.generator.prefix, + return_tensors="pt", + n_docs=retriever_n_docs, + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + # cast + retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states) + context_input_ids = context_input_ids.to(input_ids) + context_attention_mask = context_attention_mask.to(input_ids) + + # compute doc_scores + doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), retrieved_doc_embeds.transpose(1, 2)).squeeze( + 1 + ) + + self.assertRaises( + AssertionError, + model.__call__, + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + n_docs=generator_n_docs, + ) + + def check_model_with_encoder_outputs( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + for model_class in self.all_model_classes: + model = model_class(config, retriever=self.get_retriever(config)).to(torch_device) + model.eval() + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + encoder_outputs = BaseModelOutput(outputs.generator_enc_last_hidden_state) + + # run only generator + outputs = model( + encoder_outputs=encoder_outputs, + doc_scores=outputs.doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def test_model_with_retriever(self): + inputs_dict = self.config_and_inputs + self.check_model_with_retriever(**inputs_dict) + + def test_model_with_end2end_retriever(self): + inputs_dict = self.config_and_inputs + self.check_model_with_end2end_retriever(**inputs_dict) + + def test_model_without_retriever(self): + inputs_dict = self.config_and_inputs + self.check_model_without_retriever(**inputs_dict) + + def test_model_with_encoder_outputs(self): + inputs_dict = self.config_and_inputs + self.check_model_with_encoder_outputs(**inputs_dict) + + def test_model_generate(self): + inputs_dict = self.config_and_inputs + self.check_model_generate(**inputs_dict) + + def test_model_with_custom_n_docs(self): + inputs_dict = self.config_and_inputs + inputs_dict["n_docs"] = 1 + self.check_model_custom_n_docs(**inputs_dict) + + def 
test_model_with_mismatch_n_docs_value(self): + inputs_dict = self.config_and_inputs + inputs_dict["retriever_n_docs"] = 3 + inputs_dict["generator_n_docs"] = 2 + self.check_model_with_mismatch_n_docs_value(**inputs_dict) + + +@require_torch +@require_retrieval +class RagDPRBartTest(RagTestMixin, unittest.TestCase): + @cached_property + def config_and_inputs(self): + question_encoder_tester = DPRModelTester(self) + dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs() + generator_tester = BartModelTester(self) + bart_config_and_inputs = generator_tester.prepare_config_and_inputs_for_common() + + (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs + (generator_config, bart_inputs_dict) = bart_config_and_inputs + decoder_input_ids, decoder_attention_mask = bart_inputs_dict["input_ids"], bart_inputs_dict["attention_mask"] + + config = RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + n_docs=self.n_docs, + retrieval_vector_size=self.retrieval_vector_size, + max_combined_length=self.max_combined_length, + ) + + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + } + + +@require_torch +@require_retrieval +class RagDPRT5Test(RagTestMixin, unittest.TestCase): + @cached_property + def config_and_inputs(self): + question_encoder_tester = DPRModelTester(self) + dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs() + generator_tester = T5ModelTester(self, vocab_size=1100) + t5_config_and_inputs = generator_tester.prepare_config_and_inputs() + + (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs + (generator_config, _, decoder_input_ids, _, decoder_attention_mask, _) = t5_config_and_inputs + config = RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + n_docs=self.n_docs, + retrieval_vector_size=self.retrieval_vector_size, + max_combined_length=self.max_combined_length, + ) + + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + } + + +@require_torch +@require_retrieval +@require_sentencepiece +@require_tokenizers +@require_torch_non_multi_gpu +class RagModelIntegrationTests(unittest.TestCase): + @cached_property + def sequence_model(self): + return ( + RagSequenceForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" + ) + .to(torch_device) + .eval() + ) + + @cached_property + def token_model(self): + return ( + RagTokenForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" + ) + .to(torch_device) + .eval() + ) + + def get_rag_config(self): + question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") + return RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + bos_token_id=0, + decoder_start_token_id=2, + eos_token_id=2, + is_encoder_decoder=True, + pad_token_id=1, + vocab_size=50264, + title_sep=" / ", + doc_sep=" // ", + n_docs=5, + max_combined_length=300, + dataset="wiki_dpr", + dataset_split="train", + 
index_name="exact", + index_path=None, + use_dummy_dataset=True, + retrieval_vector_size=768, + retrieval_batch_size=8, + ) + + @slow + def test_rag_sequence_inference(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_sequence = self.sequence_model + rag_sequence.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids + + input_ids = input_ids.to(torch_device) + decoder_input_ids = decoder_input_ids.to(torch_device) + + with torch.no_grad(): + output = rag_sequence( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = torch.Size([5, 5, 50264]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = torch.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]).to(torch_device) + _assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE) + + expected_loss = torch.tensor([36.7368]).to(torch_device) + _assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE) + + @slow + def test_rag_token_inference(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_token = self.token_model + rag_token.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids + + input_ids = input_ids.to(torch_device) + decoder_input_ids = decoder_input_ids.to(torch_device) + + with torch.no_grad(): + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = torch.Size([5, 5, 50264]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = torch.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]).to(torch_device) + _assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE) + + expected_loss = torch.tensor([36.3557]).to(torch_device) + _assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE) + + @slow + def test_rag_token_generate_beam(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_token = self.token_model + rag_token.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + + input_ids = input_ids.to(torch_device) + + 
output_ids = rag_token.generate( + input_ids, + decoder_start_token_id=rag_token.generator.config.decoder_start_token_id, + num_beams=2, + num_return_sequences=2, + ) + # sequence generate test + output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True) + output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True) + + # Expected outputs as given by model at integration time. + EXPECTED_OUTPUT_TEXT_1 = "\"She's My Kind of Girl" + EXPECTED_OUTPUT_TEXT_2 = "\"She's My Kind of Love" + + self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1) + self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2) + + @slow + def test_rag_sequence_generate_beam(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_sequence = self.sequence_model + rag_sequence.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + + input_ids = input_ids.to(torch_device) + + output_ids = rag_sequence.generate( + input_ids, + decoder_start_token_id=rag_sequence.generator.config.decoder_start_token_id, + num_beams=2, + num_return_sequences=2, + ) + # sequence generate test + output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True) + output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True) + + # Expected outputs as given by model at integration time. + EXPECTED_OUTPUT_TEXT_1 = """\"She's My Kind of Girl\" was released through Epic Records in Japan in March 1972, giving the duo a Top 10 hit. Two more singles were released in Japan, \"En Carousel\" and \"Love Has Its Ways\" Ulvaeus and Andersson persevered with their songwriting and experimented with new sounds and vocal arrangements.""" + EXPECTED_OUTPUT_TEXT_2 = """In September 2018, Björn Ulvaeus revealed that the two new songs, \"I Still Have Faith In You\" and \"Don't Shut Me Down\", would be released no earlier than March 2019. 
The two new tracks will feature in a TV special set to air later in the year.""" + + self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1) + self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2) + + @property + def test_data_questions(self): + return [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + ] + + @slow + def test_rag_sequence_generate_batch(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") + retriever = RagRetriever.from_pretrained( + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + ) + rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( + torch_device + ) + + input_dict = tokenizer( + self.test_data_questions, + return_tensors="pt", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids.to(torch_device) + attention_mask = input_dict.attention_mask.to(torch_device) + + output_ids = rag_sequence.generate( + input_ids, + attention_mask=attention_mask, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " june 22, 2018", + " amplitude modulation", + " tim besley ( chairman )", + " june 20, 2018", + " 1980", + " 7.0", + " 8", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + @slow + def test_rag_sequence_generate_batch_from_context_input_ids(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") + retriever = RagRetriever.from_pretrained( + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + ) + rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever).to( + torch_device + ) + + input_dict = tokenizer( + self.test_data_questions, + return_tensors="pt", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids.to(torch_device) + attention_mask = input_dict.attention_mask.to(torch_device) + + question_hidden_states = rag_sequence.question_encoder(input_ids, attention_mask=attention_mask)[0] + docs_dict = retriever( + input_ids.cpu().detach().numpy(), question_hidden_states.cpu().detach().numpy(), return_tensors="pt" + ) + doc_scores = torch.bmm( + question_hidden_states.unsqueeze(1), + docs_dict["retrieved_doc_embeds"].to(torch_device).float().transpose(1, 2), + ).squeeze(1) + + output_ids = rag_sequence.generate( + context_input_ids=docs_dict["context_input_ids"].to(torch_device), + context_attention_mask=docs_dict["context_attention_mask"].to(torch_device), + doc_scores=doc_scores.to(torch_device), + do_deduplication=True, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " june 22, 2018", + " amplitude modulation", + " tim besley ( chairman )", + " june 20, 2018", + " 1980", + " 7.0", + " 8", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + @slow + def test_rag_token_generate_batch(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") + retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + rag_token = 
RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to( + torch_device + ) + + input_dict = tokenizer( + self.test_data_questions, + return_tensors="pt", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids.to(torch_device) + attention_mask = input_dict.attention_mask.to(torch_device) + + output_ids = rag_token.generate( + input_ids, + attention_mask=attention_mask, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " september 22, 2017", + " amplitude modulation", + " stefan persson", + " april 20, 2018", + " the 1970s", + " 7.1. 2", + " 13", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + +@require_torch +@require_retrieval +class RagModelSaveLoadTests(unittest.TestCase): + def get_rag_config(self): + question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") + return RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + bos_token_id=0, + decoder_start_token_id=2, + eos_token_id=2, + is_encoder_decoder=True, + pad_token_id=1, + vocab_size=50264, + title_sep=" / ", + doc_sep=" // ", + n_docs=5, + max_combined_length=300, + dataset="wiki_dpr", + dataset_split="train", + index_name="exact", + index_path=None, + use_dummy_dataset=True, + retrieval_vector_size=768, + retrieval_batch_size=8, + ) + + @slow + def test_rag_sequence_from_pretrained(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids + + input_ids = input_ids.to(torch_device) + decoder_input_ids = decoder_input_ids.to(torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", + "facebook/bart-large-cnn", + retriever=rag_retriever, + config=rag_config, + ).to(torch_device) + # check that the from pretrained methods work + rag_sequence.save_pretrained(tmp_dirname) + rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever) + rag_sequence.to(torch_device) + + with torch.no_grad(): + output = rag_sequence( + input_ids, + labels=decoder_input_ids, + ) + + loss_pretrained = output.loss + del rag_sequence + + question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn") + rag_sequence = RagSequenceForGeneration( + config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever + ) + rag_sequence.to(torch_device) + + with torch.no_grad(): + output = rag_sequence( + input_ids, + labels=decoder_input_ids, + ) + + loss_init = output.loss + + self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4) + + @slow + def test_rag_token_from_pretrained(self): + rag_config = 
self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="pt" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="pt").input_ids + + input_ids = input_ids.to(torch_device) + decoder_input_ids = decoder_input_ids.to(torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + rag_token = RagTokenForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", + "facebook/bart-large-cnn", + retriever=rag_retriever, + config=rag_config, + question_encoder_max_length=200, + generator_max_length=200, + ).to(torch_device) + # check that the from pretrained methods work + rag_token.save_pretrained(tmp_dirname) + rag_token.from_pretrained(tmp_dirname, retriever=rag_retriever) + rag_token.to(torch_device) + + self.assertTrue(rag_token.question_encoder.config.max_length == 200) + self.assertTrue(rag_token.generator.config.max_length == 200) + + with torch.no_grad(): + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + loss_pretrained = output.loss + del rag_token + + question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn") + rag_token = RagTokenForGeneration( + config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever + ) + rag_token.to(torch_device) + + with torch.no_grad(): + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + loss_init = output.loss + + self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4) diff --git a/test_modeling_reformer.py b/test_modeling_reformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e5129a10d89664dd2f5bb13a1650d02b29cf70 --- /dev/null +++ b/test_modeling_reformer.py @@ -0,0 +1,1281 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.

+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import ( + require_sentencepiece, + require_tokenizers, + require_torch, + require_torch_multi_gpu, + slow, + torch_device, +) + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + ReformerConfig, + ReformerForMaskedLM, + ReformerForQuestionAnswering, + ReformerForSequenceClassification, + ReformerLayer, + ReformerModel, + ReformerModelWithLMHead, + ReformerTokenizer, + ) + + +class ReformerModelTester: + def __init__( + self, + parent, + batch_size=None, + seq_length=None, + is_training=None, + is_decoder=None, + use_input_mask=None, + use_labels=None, + vocab_size=None, + attention_head_size=None, + hidden_size=None, + num_attention_heads=None, + local_attn_chunk_length=None, + local_num_chunks_before=None, + local_num_chunks_after=None, + num_buckets=None, + num_hashes=1, + lsh_attn_chunk_length=None, + lsh_num_chunks_before=None, + lsh_num_chunks_after=None, + chunk_size_lm_head=None, + chunk_size_feed_forward=None, + feed_forward_size=None, + hidden_act=None, + hidden_dropout_prob=None, + local_attention_probs_dropout_prob=None, + lsh_attention_probs_dropout_prob=None, + max_position_embeddings=None, + initializer_range=None, + axial_norm_std=None, + layer_norm_eps=None, + axial_pos_embds=None, + axial_pos_shape=None, + axial_pos_embds_dim=None, + attn_layers=None, + pad_token_id=None, + eos_token_id=None, + scope=None, + hash_seed=None, + num_labels=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.is_decoder = is_decoder + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.attention_head_size = attention_head_size + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = len(attn_layers) + self.local_attn_chunk_length = local_attn_chunk_length + self.local_num_chunks_after = local_num_chunks_after + self.local_num_chunks_before = local_num_chunks_before + self.num_hashes = num_hashes + self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets + self.lsh_attn_chunk_length = lsh_attn_chunk_length + self.lsh_num_chunks_after = lsh_num_chunks_after + self.lsh_num_chunks_before = lsh_num_chunks_before + self.hidden_act = hidden_act + self.feed_forward_size = feed_forward_size + self.hidden_dropout_prob = hidden_dropout_prob + self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob + self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.axial_pos_embds = axial_pos_embds + self.axial_pos_shape = tuple(axial_pos_shape) + self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) + self.axial_norm_std = axial_norm_std + self.chunk_size_lm_head = chunk_size_lm_head + self.chunk_size_feed_forward = chunk_size_feed_forward + self.scope = scope + self.attn_layers = attn_layers + self.pad_token_id = pad_token_id + self.hash_seed = hash_seed + + attn_chunk_length = local_attn_chunk_length if 
local_attn_chunk_length is not None else lsh_attn_chunk_length + num_chunks_after = local_num_chunks_after if local_num_chunks_after is not None else lsh_num_chunks_after + num_chunks_before = local_num_chunks_before if local_num_chunks_before is not None else lsh_num_chunks_before + + self.encoder_seq_length = seq_length // attn_chunk_length + (self.seq_length % attn_chunk_length != 0) + self.key_length = (num_chunks_before + num_chunks_after + 1) * attn_chunk_length + self.chunk_length = attn_chunk_length + self.num_labels = num_labels + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + choice_labels = None + if self.use_labels: + choice_labels = ids_tensor([self.batch_size], 2) + + config = ReformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + feed_forward_size=self.feed_forward_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + local_attention_probs_dropout_prob=self.local_attention_probs_dropout_prob, + lsh_attention_probs_dropout_prob=self.lsh_attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + is_decoder=self.is_decoder, + axial_pos_embds=self.axial_pos_embds, + axial_pos_shape=self.axial_pos_shape, + axial_pos_embds_dim=self.axial_pos_embds_dim, + local_attn_chunk_length=self.local_attn_chunk_length, + local_num_chunks_after=self.local_num_chunks_after, + local_num_chunks_before=self.local_num_chunks_before, + num_hashes=self.num_hashes, + num_buckets=self.num_buckets, + lsh_attn_chunk_length=self.lsh_attn_chunk_length, + lsh_num_chunks_after=self.lsh_num_chunks_after, + lsh_num_chunks_before=self.lsh_num_chunks_before, + attn_layers=self.attn_layers, + pad_token_id=self.pad_token_id, + hash_seed=self.hash_seed, + ) + + return ( + config, + input_ids, + input_mask, + choice_labels, + ) + + def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels): + model = ReformerModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + + # 2 * hidden_size because we use reversible resnet layers + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.seq_length, 2 * self.hidden_size) + ) + + def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels): + if not self.is_training: + return + + config.is_decoder = False + config.lsh_num_chunks_after = 1 + model = ReformerForMaskedLM(config=config) + model.to(torch_device) + model.train() + loss = model(input_ids, attention_mask=input_mask, labels=input_ids)["loss"] + loss.backward() + + def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels): + config.lsh_num_chunks_after = 0 + config.is_decoder = True + model = ReformerModelWithLMHead(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choice_labels): + config.is_decoder = False + model = ReformerForMaskedLM(config=config) + model.to(torch_device) + 
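Stepping back to the bookkeeping in ReformerModelTester.__init__ above: encoder_seq_length and key_length encode Reformer's chunked attention, where the sequence splits into ceil(seq_length / chunk_length) chunks and each query chunk attends to itself plus its configured neighbours. A worked example with the local-attention settings used by ReformerLocalAttnModelTest further down:

    seq_length, chunk_length = 32, 4             # prepare_kwargs values below
    num_chunks_before, num_chunks_after = 1, 0
    num_chunks = seq_length // chunk_length + (seq_length % chunk_length != 0)  # ceil -> 8
    key_length = (num_chunks_before + num_chunks_after + 1) * chunk_length      # -> 8 keys per query chunk
    assert (num_chunks, key_length) == (8, 8)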
model.eval() + result = model(input_ids, attention_mask=input_mask, labels=input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_reformer_model_with_attn_mask( + self, config, input_ids, input_mask, choice_labels, is_decoder=False + ): + # no special position embeddings + config.axial_pos_embds = False + config.is_decoder = is_decoder + + if self.lsh_attn_chunk_length is not None: + # need to set the chunk length equal to the sequence length to be certain that chunking works + config.lsh_attn_chunk_length = self.seq_length + + model = ReformerModel(config=config) + model.to(torch_device) + model.eval() + # set all position encodings to zero so that positions don't matter + with torch.no_grad(): + embedding = model.embeddings.position_embeddings.embedding + embedding.weight = nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device)) + embedding.weight.requires_grad = False + + half_seq_len = self.seq_length // 2 + roll = self.chunk_length + + half_input_ids = input_ids[:, :half_seq_len] + + # normal padded + attn_mask = torch.cat( + [torch.ones_like(half_input_ids), torch.zeros_like(half_input_ids)], + dim=-1, + ) + input_ids_padded = torch.cat( + [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], + dim=-1, + ) + + # shifted padded + input_ids_roll = torch.cat( + [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], + dim=-1, + ) + input_ids_roll = torch.roll(input_ids_roll, roll, dims=-1) + attn_mask_roll = torch.roll(attn_mask, roll, dims=-1) + + output_padded = model(input_ids_padded, attention_mask=attn_mask)[0][:, :half_seq_len] + output_padded_rolled = model(input_ids_roll, attention_mask=attn_mask_roll)[0][:, roll : half_seq_len + roll] + + self.parent.assertTrue(torch.allclose(output_padded, output_padded_rolled, atol=1e-3)) + + def create_and_check_reformer_layer_dropout_seed( + self, config, input_ids, input_mask, choice_labels, is_decoder=False + ): + config.is_decoder = is_decoder + layer = ReformerLayer(config).to(torch_device) + layer.train() + shape = ( + self.batch_size, + self.seq_length, + config.hidden_size, + ) # Batch x SeqLen x hiddenSize + + # get random tensors + hidden_states = floats_tensor(shape) + prev_attn_output = floats_tensor(shape) + + # now the random seeds for attention and feed forward are initialized + # forward tensors with dropout + layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask) + + next_attn_output = layer_outputs.attn_output + next_hidden_states = layer_outputs.hidden_states + + torch.manual_seed(layer.attention_seed) + attn_outputs = layer.attention(hidden_states, attention_mask=input_mask) + self.parent.assertTrue( + torch.allclose( + prev_attn_output + attn_outputs.hidden_states, + next_attn_output, + atol=1e-3, + ) + ) + + torch.manual_seed(layer.feed_forward_seed) + feed_forward_hidden_states = layer.feed_forward(next_attn_output) + self.parent.assertTrue( + torch.allclose( + next_hidden_states, + hidden_states + feed_forward_hidden_states, + atol=1e-3, + ) + ) + + def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels): + if not self.is_training: + return + + # disable dropout + config.hidden_dropout_prob = 0 + config.local_attention_probs_dropout_prob = 0 + config.lsh_attention_probs_dropout_prob = 0 + config.lsh_num_chunks_after = 1 + config.is_decoder = False + + torch.manual_seed(0) + model = ReformerForMaskedLM(config=config)
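This chunking-equality check verifies that enabling chunk_size_feed_forward / chunk_size_lm_head changes memory behaviour but not results. The idea, as a toy sketch (chunked_ffn is a hypothetical helper, not the transformers implementation): apply the feed-forward block to the sequence axis a few tokens at a time and concatenate, which is mathematically identical for a position-wise FFN.

    import torch
    from torch import nn

    def chunked_ffn(ffn, hidden_states, chunk_size):
        if chunk_size == 0:                                # 0 disables chunking, cf. chunk_size_feed_forward=0
            return ffn(hidden_states)
        chunks = hidden_states.split(chunk_size, dim=1)    # split along the sequence axis
        return torch.cat([ffn(chunk) for chunk in chunks], dim=1)

    ffn = nn.Linear(16, 16)                                # position-wise, so chunking cannot change the output
    x = torch.randn(2, 8, 16)
    assert torch.allclose(chunked_ffn(ffn, x, 0), chunked_ffn(ffn, x, 1), atol=1e-6)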
+ model.to(torch_device) + model.train() + model.zero_grad() + loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2] + loss_no_chunk.backward() + grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] + grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] + grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] + + config.chunk_size_lm_head = 1 + config.chunk_size_feed_forward = 1 + + torch.manual_seed(0) + model = ReformerForMaskedLM(config=config) + model.to(torch_device) + model.train() + model.zero_grad() + loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2] + loss_chunk.backward() + grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] + grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] + grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] + self.parent.assertTrue(torch.allclose(loss_chunk, loss_no_chunk, atol=1e-3)) + self.parent.assertTrue(torch.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3)) + self.parent.assertTrue( + torch.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3) + ) + self.parent.assertTrue( + torch.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3) + ) + + def create_and_check_reformer_random_seed(self, config, input_ids, input_mask, choice_labels): + layer = ReformerLayer(config).to(torch_device) + layer.train() + + shape = ( + self.batch_size, + self.seq_length, + config.hidden_size, + ) # Batch x SeqLen x hiddenSize + + hidden_states = floats_tensor(shape) + attn_output = floats_tensor(shape) + + seeds = [] + for _ in range(100): + layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask) + attn_output = layer_outputs.attn_output + hidden_states = layer_outputs.hidden_states + torch.manual_seed(layer.attention_seed) + seeds.append(layer.attention_seed) + self.parent.assertGreater(len(set(seeds)), 70) + + seeds = [] + for _ in range(100): + layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask) + attn_output = layer_outputs.attn_output + hidden_states = layer_outputs.hidden_states + torch.manual_seed(layer.feed_forward_seed) + seeds.append(layer.feed_forward_seed) + self.parent.assertGreater(len(set(seeds)), 70) + + def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_mask, choice_labels): + model = ReformerModel(config=config) + model.to(torch_device) + model.half() + model.eval() + output = model(input_ids, attention_mask=input_mask)["last_hidden_state"] + self.parent.assertFalse(torch.isnan(output).any().item()) + + def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels): + config.is_decoder = True + config.lsh_num_chunks_after = 0 + config.bos_token_id = 0 + config.eos_token_id = None + config.max_length = 20 + + model = ReformerModelWithLMHead(config=config) + model.to(torch_device) + model.eval() + output = model.generate() + self.parent.assertIsNotNone(output) + + def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask, choice_labels): + config.is_decoder = True + config.lsh_num_chunks_after = 0 + model = ReformerModelWithLMHead(config=config) + 
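The fp16 checks in this tester all reduce to one pattern: run the half-precision model and assert that nothing overflowed to NaN (float16 tops out around 65504, so unstable attention logits surface as NaNs here). A minimal stand-alone version of that assertion, on toy data rather than real model output:

    import torch

    logits = torch.randn(2, 8).half()    # stand-in for a half-precision model output
    assert not torch.isnan(logits).any().item()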
model.to(torch_device) + model.half() + model.eval() + # only use last 10 inputs for generation + output = model.generate(input_ids[:, -10:], attention_mask=input_mask, do_sample=False) + self.parent.assertFalse(torch.isnan(output).any().item()) + + def create_and_check_reformer_no_chunking(self, config, input_ids, input_mask, choice_labels): + # force chunk length to be bigger than input_ids + config.lsh_attn_chunk_length = 2 * input_ids.shape[-1] + config.local_attn_chunk_length = 2 * input_ids.shape[-1] + config.lsh_num_chunks_after = 1 + config.is_decoder = False + model = ReformerForMaskedLM(config=config) + model.to(torch_device) + model.eval() + output_logits = model(input_ids, attention_mask=input_mask)["logits"] + self.parent.assertTrue(output_logits.shape[1] == input_ids.shape[-1]) + + def create_and_check_reformer_for_question_answering(self, config, input_ids, input_mask, choice_labels): + model = ReformerForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + start_positions=choice_labels, + end_positions=choice_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_past_buckets_states(self, config, input_ids, input_mask, choice_labels): + config.is_decoder = True + config.lsh_num_chunks_before = 1 + config.lsh_num_chunks_after = 0 + model = ReformerModelWithLMHead(config=config) + model.to(torch_device) + model.eval() + input_ids_first = input_ids[:, :-1] + input_ids_second = input_ids[:, -1:] + + # return saved cache + past_buckets_states = model(input_ids_first, use_cache=True)["past_buckets_states"] + + # calculate last output with and without cache + outputs_with_cache = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)["logits"] + outputs_without_cache = model(input_ids)["logits"][:, -1] + + # select random slice idx + random_slice_idx = torch.randint(outputs_without_cache.shape[-1], (1, 1), device=torch_device).item() + + # outputs should be similar within range + self.parent.assertTrue( + torch.allclose( + outputs_with_cache[:, 0, random_slice_idx], outputs_without_cache[:, random_slice_idx], atol=1e-2 + ) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + def create_and_check_reformer_for_sequence_classification( + self, config, input_ids, input_mask, choice_labels, is_decoder + ): + config.is_decoder = is_decoder + sequence_labels = ids_tensor([self.batch_size], config.num_labels) + model = ReformerForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + +class ReformerTesterMixin: + """ + Reformer Local and Reformer LSH run essentially the same tests + """ + + def test_config(self): + self.config_tester.run_common_tests() + + def test_reformer_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model(*config_and_inputs) + + def test_reformer_lm_model_backward(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model_with_lm_backward(*config_and_inputs) + + def test_reformer_model_attn_masking(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=True) + self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=False) + + def test_reformer_with_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_with_lm(*config_and_inputs) + + def test_reformer_with_mlm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_with_mlm(*config_and_inputs) + + def test_reformer_layer_training_dropout(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=True) + self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=False) + + def test_reformer_chunking_backward_equality(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs) + + def test_reformer_no_chunking(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_no_chunking(*config_and_inputs) + + def test_reformer_qa_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_for_question_answering(*config_and_inputs) + + def test_reformer_cached_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_past_buckets_states(*config_and_inputs) + + def test_reformer_cached_generate(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model_generate(*config_and_inputs) + + @slow + def test_dropout_random_seed_is_changing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_random_seed(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_reformer_model_fp16_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model_fp16_forward(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_reformer_model_fp16_generate(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs) + + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): + # Opt-out of this test.
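+ # Reformer mutates per-layer attention / feed-forward dropout seeds as
+ # state during the forward pass (see create_and_check_reformer_random_seed
+ # in the tester above); that state presumably does not survive
+ # torch.nn.DataParallel's per-replica copies, hence the opt-out.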
+ pass + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reformer_for_sequence_classification(*config_and_inputs, is_decoder=False) + + def test_retain_grad_hidden_states_attentions(self): + # reformer cannot keep gradients in attentions or hidden states + return + + def test_resize_embeddings_untied(self): + # reformer cannot resize embeddings that easily + return + + +@require_torch +class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else () + test_pruning = False + test_headmasking = False + test_torchscript = False + test_sequence_classification_problem_types = True + + def prepare_kwargs(self): + return { + "batch_size": 13, + "seq_length": 32, + "is_training": True, + "is_decoder": True, + "use_input_mask": True, + "use_labels": True, + "vocab_size": 32, + "attention_head_size": 16, + "hidden_size": 32, + "num_attention_heads": 2, + "local_attn_chunk_length": 4, + "local_num_chunks_before": 1, + "local_num_chunks_after": 0, + "chunk_size_lm_head": 0, + "chunk_size_feed_forward": 0, + "feed_forward_size": 32, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "local_attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "initializer_range": 0.02, + "axial_norm_std": 1.0, + "layer_norm_eps": 1e-12, + "axial_pos_embds": True, + "axial_pos_shape": [4, 8], + "axial_pos_embds_dim": [16, 16], + "attn_layers": ["local", "local", "local", "local"], + "pad_token_id": 0, + "eos_token_id": 2, + "scope": None, + "hash_seed": 0, + "num_labels": 2, + } + + def setUp(self): + tester_kwargs = self.prepare_kwargs() + self.model_tester = ReformerModelTester(self, **tester_kwargs) + self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37) + + @slow + def test_model_from_pretrained(self): + for model_name in REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ReformerModelWithLMHead.from_pretrained(model_name) + self.assertIsNotNone(model) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, list) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length + idx if not use_cache else 1 + num_chunks = tgt_len // config.local_attn_chunk_length + (tgt_len % config.local_attn_chunk_length != 0) + tgt_chunk_len = config.local_attn_chunk_length + src_chunk_len = config.local_attn_chunk_length * ( + 1 + config.local_num_chunks_after + config.local_num_chunks_before + ) + + if use_cache: + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + min_length // config.local_attn_chunk_length + 1 + idx, + ) + else: + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + num_chunks, + tgt_chunk_len, + src_chunk_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], 
[expected_shape] * len(iter_attentions) + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, list) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length + idx + seq_len = config.local_attn_chunk_length * ( + seq_len // config.local_attn_chunk_length + (seq_len % config.local_attn_chunk_length != 0) + ) + + if use_cache: + seq_len = 1 + + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + +@require_torch +class ReformerLSHAttnModelTest(ReformerTesterMixin, ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (ReformerModelWithLMHead,) if is_torch_available() else () + test_pruning = False + test_headmasking = False + test_torchscript = False + + def prepare_kwargs(self): + return { + "batch_size": 13, + "seq_length": 13, + "use_input_mask": True, + "use_labels": True, + "is_training": False, + "is_decoder": True, + "vocab_size": 32, + "attention_head_size": 16, + "hidden_size": 64, + "num_attention_heads": 2, + "num_buckets": 2, + "num_hashes": 4, + "lsh_attn_chunk_length": 4, + "lsh_num_chunks_before": 1, + "lsh_num_chunks_after": 0, + "chunk_size_lm_head": 5, + "chunk_size_feed_forward": 6, + "feed_forward_size": 32, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "lsh_attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "initializer_range": 0.02, + "axial_norm_std": 1.0, + "layer_norm_eps": 1e-12, + "axial_pos_embds": True, + "axial_pos_shape": [4, 8], + "axial_pos_embds_dim": [16, 48], + # "attn_layers": ["lsh", "lsh", "lsh", "lsh"], + "attn_layers": ["lsh"], + "pad_token_id": 0, + "eos_token_id": 2, + "scope": None, + "hash_seed": 0, + "num_labels": 2, + } + + def setUp(self): + tester_kwargs = self.prepare_kwargs() + self.model_tester = ReformerModelTester(self, **tester_kwargs) + self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, list) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length + idx if not use_cache else 1 + num_chunks = tgt_len // config.lsh_attn_chunk_length + (tgt_len % config.lsh_attn_chunk_length != 0) + tgt_chunk_len = config.lsh_attn_chunk_length + src_chunk_len = config.lsh_attn_chunk_length * ( + 1 + config.lsh_num_chunks_after + config.lsh_num_chunks_before + ) + + if use_cache: + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + config.num_hashes, +
tgt_len, + config.num_hashes * (1 + config.lsh_num_chunks_after + config.lsh_num_chunks_before), + ) + else: + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + num_chunks * config.num_hashes, + tgt_chunk_len, + src_chunk_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, list) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length + idx + seq_len = config.lsh_attn_chunk_length * ( + seq_len // config.lsh_attn_chunk_length + (seq_len % config.lsh_attn_chunk_length != 0) + ) + + if use_cache: + seq_len = 1 + + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class ReformerIntegrationTests(unittest.TestCase): + """ + These integration tests check the current layer activations and gradients against the output of the Hugging Face Reformer model at the time of integration (29/06/2020). During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "local" only, masked / non-masked, different chunk lengths, ...). In order to recover the original Trax integration tests, one should use patrickvonplaten's fork of Trax and the code that lives on the branch `reformer_trax_tests`.
+ """ + + def _get_basic_config_and_input(self): + config = { + "vocab_size": 320, + "attention_head_size": 8, + "hidden_size": 16, + "num_attention_heads": 2, + "num_buckets": 2, + "num_hashes": 4, + "lsh_attn_chunk_length": 4, + "local_attn_chunk_length": 4, + "lsh_num_chunks_before": 1, + "lsh_num_chunks_after": 0, + "local_num_chunks_before": 1, + "local_num_chunks_after": 0, + "chunk_size_lm_head": 0, + "chunk_size_feed_forward": 0, + "feed_forward_size": 32, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "lsh_attention_probs_dropout_prob": 0.0, + "local_attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 32, + "initializer_range": 0.02, + "axial_norm_std": 1.0, + "layer_norm_eps": 1e-12, + "sinusoidal_pos_embds": False, + "axial_pos_embds": True, + "axial_pos_shape": [4, 8], + "axial_pos_embds_dim": [8, 8], + "hash_seed": 0, + "is_decoder": True, + } + return config + + def _get_hidden_states(self): + return torch.tensor( + [ + [ + [ + 1.90826353e00, + -1.45999730e00, + -6.20405462e-01, + 1.52503433e00, + -3.64464232e-01, + -8.27359235e-01, + 8.39670803e-01, + 2.44492178e-01, + 4.98332758e-01, + 2.69175139e00, + -7.08081422e-03, + 1.04915401e00, + -1.83476661e00, + 7.67220476e-01, + 2.98580543e-01, + 2.84803992e-02, + ], + [ + -2.66374286e-02, + 4.33497576e-01, + 3.10386309e-01, + 5.46039944e-01, + -2.47292666e-04, + -7.52305019e-01, + 2.39162103e-01, + 7.25216186e-01, + -7.58357372e-01, + 4.20635998e-01, + -4.04739919e-02, + 1.59924145e-01, + 2.05135748e00, + -1.15997978e00, + 5.37166397e-01, + 2.62873606e-01, + ], + [ + 1.85247482e-01, + 7.07046037e-01, + -6.77089715e-01, + -2.24209655e00, + -3.75307980e-02, + -8.59380874e-01, + -2.81027884e00, + 1.01276376e00, + -1.69438001e00, + 4.17574660e-01, + -1.49196962e00, + -1.76483717e00, + -1.94566312e-01, + -1.71183858e00, + 7.72903565e-01, + -1.11557056e00, + ], + [ + 9.46069193e-01, + 1.53417623e-01, + -9.58686996e-01, + 1.18126669e-01, + 1.75967724e00, + 1.62194590e00, + -5.74108159e-01, + 6.79920443e-01, + 5.44028163e-01, + 2.05466114e-01, + -3.63045868e-01, + 2.41865062e-01, + 3.20348382e-01, + -9.05611176e-01, + -1.92690727e-01, + -1.19917547e00, + ], + ] + ], + dtype=torch.float32, + device=torch_device, + ) + + def _get_attn_mask(self): + return torch.tensor([[0, 1, 0, 0]], dtype=torch.long, device=torch_device) + + def _get_input_ids_and_mask(self): + mask = torch.tensor( + [ + [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0], + ], + dtype=torch.long, + device=torch_device, + ) + + input_ids = torch.tensor( + [ + [ + 89, + 279, + 286, + 84, + 194, + 316, + 182, + 28, + 283, + 37, + 169, + 7, + 253, + 267, + 107, + 250, + 44, + 7, + 102, + 62, + 3, + 243, + 171, + 265, + 302, + 48, + 164, + 264, + 148, + 229, + 280, + 150, + ], + [ + 9, + 192, + 66, + 112, + 163, + 83, + 135, + 70, + 224, + 96, + 31, + 80, + 196, + 80, + 63, + 22, + 85, + 100, + 47, + 283, + 0, + 163, + 126, + 143, + 195, + 82, + 53, + 82, + 18, + 27, + 182, + 52, + ], + ], + dtype=torch.long, + device=torch_device, + ) + + return input_ids, mask + + def test_lsh_layer_forward(self): + config = self._get_basic_config_and_input() + config["lsh_num_chunks_before"] = 0 + config["attn_layers"] = ["lsh"] + config["is_decoder"] = False + hidden_states = self._get_hidden_states() + torch.manual_seed(0) + layer = ReformerLayer(ReformerConfig(**config)).to(torch_device) + layer.eval() + reformer_output = 
layer(prev_attn_output=hidden_states.clone(), hidden_states=hidden_states) + output_slice = reformer_output.hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [1.6879, -1.3083, -0.4708, 1.3555, -0.6292], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_lsh_layer_forward_complex(self): + config = self._get_basic_config_and_input() + config["lsh_num_chunks_before"] = 0 + config["attn_layers"] = ["lsh"] + config["num_buckets"] = [2, 4] + attn_mask = self._get_attn_mask() + hidden_states = self._get_hidden_states() + torch.manual_seed(0) + layer = ReformerLayer(ReformerConfig(**config)).to(torch_device) + layer.eval() + reformer_output = layer( + prev_attn_output=hidden_states.clone(), + hidden_states=hidden_states, + attention_mask=attn_mask, + ) + output_slice = reformer_output.hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [1.6439, -1.2306, -0.5108, 1.3006, -0.6537], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_local_layer_forward(self): + config = self._get_basic_config_and_input() + config["local_num_chunks_before"] = 0 + config["attn_layers"] = ["local"] + config["is_decoder"] = False + hidden_states = self._get_hidden_states() + torch.manual_seed(0) + layer = ReformerLayer(ReformerConfig(**config)).to(torch_device) + layer.eval() + reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states) + output_slice = reformer_output.hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [1.4212, -2.0576, -0.9688, 1.4599, -0.1344], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_local_layer_forward_complex(self): + config = self._get_basic_config_and_input() + config["local_num_chunks_before"] = 0 + config["attn_layers"] = ["local"] + attn_mask = self._get_attn_mask() + hidden_states = self._get_hidden_states() + torch.manual_seed(0) + layer = ReformerLayer(ReformerConfig(**config)).to(torch_device) + layer.eval() + reformer_output = layer( + prev_attn_output=hidden_states, + hidden_states=hidden_states, + attention_mask=attn_mask, + ) + output_slice = reformer_output.hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [1.4750, -2.0235, -0.9743, 1.4463, -0.1269], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_lsh_model_forward(self): + config = self._get_basic_config_and_input() + config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"] + config["num_buckets"] = [2, 4] + torch.manual_seed(0) + model = ReformerModel(ReformerConfig(**config)).to(torch_device) + model.eval() + input_ids, attn_mask = self._get_input_ids_and_mask() + hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] + output_slice = hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [-0.9896, -0.9396, -1.0831, -0.0597, 0.2456], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_local_model_forward(self): + config = self._get_basic_config_and_input() + config["attn_layers"] = ["local", "local", "local", "local"] + torch.manual_seed(0) + model = ReformerModel(ReformerConfig(**config)).to(torch_device) + model.eval() + input_ids, attn_mask = 
self._get_input_ids_and_mask() + hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] + output_slice = hidden_states[0, 0, :5] + expected_output_slice = torch.tensor( + [-1.6791, 0.7171, 0.1594, 0.4063, 1.2584], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_lm_model_forward(self): + config = self._get_basic_config_and_input() + config["attn_layers"] = ["local", "lsh", "local", "lsh", "local", "lsh"] + config["num_buckets"] = [2, 4] + config["is_decoder"] = False + torch.manual_seed(0) + model = ReformerForMaskedLM(ReformerConfig(**config)).to(torch_device) + model.eval() + input_ids, attn_mask = self._get_input_ids_and_mask() + hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] + output_slice = hidden_states[1, -1, :5] + expected_output_slice = torch.tensor( + [0.0256, -0.0121, 0.0636, 0.0024, -0.0393], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + def test_local_lm_model_grad(self): + config = self._get_basic_config_and_input() + config["attn_layers"] = ["local", "local", "local", "local"] + config["hidden_dropout_prob"] = 0.0 + config["local_attention_probs_dropout_prob"] = 0.0 + torch.manual_seed(0) + model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device) + model.train() + model.zero_grad() + input_ids, _ = self._get_input_ids_and_mask() + loss = model(input_ids=input_ids, labels=input_ids)[0] + + self.assertTrue(torch.allclose(loss, torch.tensor(5.7786, dtype=torch.float, device=torch_device), atol=1e-3)) + loss.backward() + + # check last grads to cover all probable errors + grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] + expected_grad_slice_word = torch.tensor( + [-0.0005, 0.0001, 0.0002, 0.0003, 0.0006], + dtype=torch.float, + device=torch_device, + ) + grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] + expected_grad_slice_pos_fac_1 = torch.tensor( + [0.0037, -1.3793, -1.0231, -1.5230, -2.5306], + dtype=torch.float, + device=torch_device, + ) + grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] + expected_grad_slice_pos_fac_2 = torch.tensor( + [-1.3165, 0.5168, 0.7785, 1.0811, -0.9830], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3)) + self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3)) + self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3)) + + def test_lsh_lm_model_grad(self): + config = self._get_basic_config_and_input() + config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"] + config["hidden_dropout_prob"] = 0.0 + config["lsh_attention_probs_dropout_prob"] = 0.0 + config["num_buckets"] = [2, 4] + config["num_hashes"] = 6 + torch.manual_seed(0) + model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device) + model.train() + model.zero_grad() + input_ids, _ = self._get_input_ids_and_mask() + loss = model(input_ids=input_ids, labels=input_ids)[0] + + self.assertTrue(torch.allclose(loss, torch.tensor(5.7819, dtype=torch.float, device=torch_device), atol=1e-3)) + loss.backward() + # check last grads to cover all probable errors + grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] +
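+ # The hard-coded slices below pin the LSH gradients at integration time
+ # (see the class docstring). After an intentional modelling change they
+ # can be regenerated from a reference run under torch.manual_seed(0),
+ # e.g. by printing grad_slice_word and the two position-factor slices.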
expected_grad_slice_word = torch.tensor( + [2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04], + dtype=torch.float, + device=torch_device, + ) + grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] + expected_grad_slice_pos_fac_1 = torch.tensor( + [-0.0984, 0.6283, 0.4282, 1.2960, 0.6897], + dtype=torch.float, + device=torch_device, + ) + grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] + expected_grad_slice_pos_fac_2 = torch.tensor( + [0.4626, -0.0231, -0.0172, 0.1081, 0.3805], + dtype=torch.float, + device=torch_device, + ) + self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3)) + self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3)) + self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3)) + + @slow + def test_pretrained_generate_crime_and_punish(self): + model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device) + tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment") + model.eval() + + input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device) + output_ids = model.generate( + input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8 + ) + output = tokenizer.decode(output_ids[0]) + + self.assertEqual( + output, + "A few months later state expression in his ideas, at the first entrance. He was positively for an inst", + ) + + @slow + def test_pretrained_generate_use_cache_equality(self): + model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device) + tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment") + model.eval() + input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device) + output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True) + output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False) + + output_with_cache = tokenizer.decode(output_ids_with_cache[0]) + output_without_cache = tokenizer.decode(output_ids_without_cache[0]) + + self.assertEqual(output_with_cache, output_without_cache) diff --git a/test_modeling_roberta.py b/test_modeling_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..168e5073d7dbc1549b752d4014f42c6b11fa72f3 --- /dev/null +++ b/test_modeling_roberta.py @@ -0,0 +1,529 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
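+
+# The file follows the standard transformers test layout: RobertaModelTester
+# builds a small random config plus inputs, and the unittest classes route them
+# through the shared mixin checks. A rough sketch of the pattern (illustrative
+# only, not an exact excerpt of this file):
+#
+#     tester = RobertaModelTester(parent=test_case)
+#     config, inputs_dict = tester.prepare_config_and_inputs_for_common()
+#     model = RobertaModel(config).to(torch_device).eval()
+#     outputs = model(**inputs_dict)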
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + RobertaConfig, + RobertaForCausalLM, + RobertaForMaskedLM, + RobertaForMultipleChoice, + RobertaForQuestionAnswering, + RobertaForSequenceClassification, + RobertaForTokenClassification, + RobertaModel, + ) + from transformers.models.roberta.modeling_roberta import ( + ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + RobertaEmbeddings, + create_position_ids_from_input_ids, + ) + + +class RobertaModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RobertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
RobertaModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = RobertaModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = RobertaForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = RobertaForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, +
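+ # the cached branch passes only the three freshly appended tokens plus
+ # the `past_key_values` captured above; its hidden states must match the
+ # full re-encoding on a random feature slice (checked further below)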
encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RobertaForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = RobertaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = RobertaForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RobertaForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + RobertaForCausalLM, + RobertaForMaskedLM, + 
RobertaModel, + RobertaForSequenceClassification, + RobertaForTokenClassification, + RobertaForMultipleChoice, + RobertaForQuestionAnswering, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (RobertaForCausalLM,) if is_torch_available() else () + test_sequence_classification_problem_types = True + + def setUp(self): + self.model_tester = RobertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RobertaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """Ensure that the default position ids only assign a sequential index to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index.
Therefore, the + first available non-padding position index is RobertaEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = RobertaEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """Ensure that the default position ids only assign a sequential index to non-padding tokens. This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is RobertaEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = RobertaEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class RobertaModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = RobertaForMaskedLM.from_pretrained("roberta-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + + # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base') + # roberta.eval() + # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = RobertaModel.from_pretrained("roberta-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice.
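+ # output[:, :3, :3] is the hidden state of the first three tokens
+ # restricted to their first three features; pinning a small slice keeps
+ # the expected tensor readable while still catching regressions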
+ expected_slice = torch.tensor( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + + # roberta = torch.hub.load('pytorch/fairseq', 'roberta.base') + # roberta.eval() + # expected_slice = roberta.extract_features(input_ids)[:, :3, :3].detach() + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + + # roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') + # roberta.eval() + # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach() + + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) diff --git a/test_modeling_roformer.py b/test_modeling_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..fdb39abbf931bec360c7132d6d381ea59e4a6d82 --- /dev/null +++ b/test_modeling_roformer.py @@ -0,0 +1,556 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch RoFormer model. 
""" + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + RoFormerConfig, + RoFormerForCausalLM, + RoFormerForMaskedLM, + RoFormerForMultipleChoice, + RoFormerForQuestionAnswering, + RoFormerForSequenceClassification, + RoFormerForTokenClassification, + RoFormerModel, + ) + from transformers.models.roformer.modeling_roformer import ( + ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + RoFormerSelfAttention, + RoFormerSinusoidalPositionalEmbedding, + ) + + +class RoFormerModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RoFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, 
choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RoFormerModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = RoFormerModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = RoFormerForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RoFormerForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = RoFormerForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical 
multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RoFormerForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = RoFormerForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = RoFormerForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = RoFormerForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, +
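+ # every input was expanded above to (batch_size, num_choices, seq_length);
+ # the model flattens the choice dimension internally and returns one
+ # logit per choice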
attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + RoFormerModel, + RoFormerForMaskedLM, + RoFormerForCausalLM, + RoFormerForMultipleChoice, + RoFormerForQuestionAnswering, + RoFormerForSequenceClassification, + RoFormerForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (RoFormerForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = RoFormerModelTester(self) + self.config_tester = ConfigTester(self, config_class=RoFormerConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + for model_name in ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RoFormerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + 
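+
+# The embedding tests below pin RoFormerSinusoidalPositionalEmbedding to the
+# classic half-sin / half-cos sinusoid table. A minimal reference sketch
+# (a local helper assumed to be equivalent, not the library implementation):
+#
+#     def sinusoid_table(num_positions: int, dim: int) -> torch.Tensor:
+#         position = torch.arange(num_positions, dtype=torch.float).unsqueeze(1)
+#         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+#         angles = position * inv_freq  # (num_positions, dim // 2)
+#         return torch.cat([angles.sin(), angles.cos()], dim=-1)
+#
+# For example, sinusoid_table(2, 6)[1] is approximately
+# [0.8415, 0.0464, 0.0022, 0.5403, 0.9989, 1.0000], matching the
+# desired_weights asserted in RoFormerSinusoidalPositionalEmbeddingTest.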
+@require_torch +class RoFormerModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = RoFormerForMaskedLM.from_pretrained("junnyu/roformer_chinese_base") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # vocab size of the junnyu/roformer_chinese_base checkpoint + vocab_size = 50000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + # expected slice of the output logits for the checkpoint above + expected_slice = torch.tensor( + [[[-0.1205, -1.0265, 0.2922], [-1.5134, 0.1974, 0.1519], [-5.0135, -3.9003, -0.8404]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + +@require_torch +class RoFormerSinusoidalPositionalEmbeddingTest(unittest.TestCase): + tolerance = 1e-4 + + def test_basic(self): + input_ids = torch.tensor([[4, 10]], dtype=torch.long, device=torch_device) + emb1 = RoFormerSinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6).to(torch_device) + emb = emb1(input_ids.shape) + desired_weights = torch.tensor( + [[0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 1.0000], [0.8415, 0.0464, 0.0022, 0.5403, 0.9989, 1.0000]] + ).to(torch_device) + self.assertTrue( + torch.allclose(emb, desired_weights, atol=self.tolerance), + msg=f"\nexp:\n{desired_weights}\ngot:\n{emb[0]}\n", + ) + + def test_positional_emb_weights_against_roformer(self): + desired_weights = torch.tensor( + [ + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.8415, 0.8219, 0.8020, 0.7819, 0.7617], + [0.9093, 0.9364, 0.9581, 0.9749, 0.9870], + ] + ).to(torch_device) + emb1 = RoFormerSinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512).to(torch_device) + weights = emb1.weight.data[:3, :5].to(torch_device) + + self.assertTrue( + torch.allclose(weights, desired_weights, atol=self.tolerance), + msg=f"\nexp:\n{desired_weights}\ngot:\n{weights}\n", + ) + + +@require_torch +class RoFormerSelfAttentionRotaryPositionEmbeddingTest(unittest.TestCase): + tolerance = 1e-4 + + def test_apply_rotary_position_embeddings(self): + # 2,12,16,64 + query_layer = ( + torch.arange(2 * 12 * 16 * 64, dtype=torch.float, device=torch_device).reshape(2, 12, 16, 64) / 100 + ).to(torch_device) + key_layer = ( + -torch.arange(2 * 12 * 16 * 64, dtype=torch.float, device=torch_device).reshape(2, 12, 16, 64) / 100 + ).to(torch_device) + embed_positions = RoFormerSinusoidalPositionalEmbedding(num_positions=32, embedding_dim=64).to(torch_device) + sinusoidal_pos = embed_positions([2, 16, 768])[None, None, :, :] + + query_layer, key_layer = RoFormerSelfAttention.apply_rotary_position_embeddings( + sinusoidal_pos, query_layer, key_layer + ) + + desired_query_layer = torch.tensor( + [ + [0.0000, 0.0100, 0.0200, 0.0300, 0.0400, 0.0500, 0.0600, 0.0700], + [-0.2012, 0.8897, 0.0263, 0.9401, 0.2074, 0.9463, 0.3481, 0.9343], + [-1.7057, 0.6271, -1.2145, 1.3897, -0.6303, 1.7647, -0.1173, 1.8985], + [-2.1731, -1.6397, -2.7358, 0.2854, -2.1840, 1.7183, -1.3018, 2.4871], + [0.2717, -3.6173, -2.9206, -2.1988, -3.6638, 0.3858, -2.9155, 2.2980], + [3.9859, -2.1580, -0.7984, -4.4904, -4.1181, -2.0252, -4.4782, 1.1253], + ] + ).to(torch_device) + desired_key_layer = torch.tensor( + [ + [0.0000, -0.0100, -0.0200, -0.0300, -0.0400, -0.0500, -0.0600, -0.0700], + [0.2012, -0.8897, -0.0263, -0.9401, -0.2074, -0.9463, -0.3481, -0.9343], + [1.7057, -0.6271, 1.2145, -1.3897, 0.6303, -1.7647, 0.1173, -1.8985], + [2.1731, 1.6397, 2.7358, -0.2854, 2.1840, -1.7183, 1.3018, -2.4871], + [-0.2717, 3.6173, 2.9206, 2.1988, 3.6638, -0.3858, 2.9155,
-2.2980], + [-3.9859, 2.1580, 0.7984, 4.4904, 4.1181, 2.0252, 4.4782, -1.1253], + ] + ).to(torch_device) + + self.assertTrue( + torch.allclose(query_layer[0, 0, :6, :8], desired_query_layer, atol=self.tolerance), + msg=f"\nexp:\n{desired_query_layer}\ngot:\n{query_layer}\n", + ) + self.assertTrue( + torch.allclose(key_layer[0, 0, :6, :8], desired_key_layer, atol=self.tolerance), + msg=f"\nexp:\n{desired_key_layer}\ngot:\n{key_layer}\n", + ) diff --git a/test_modeling_speech_to_text.py b/test_modeling_speech_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..102a33f4a38f4b3a8f833eade68fbd14380b4edb --- /dev/null +++ b/test_modeling_speech_to_text.py @@ -0,0 +1,765 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Speech2Text model. """ + + +import copy +import inspect +import os +import tempfile +import unittest + +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + is_torch_available, + require_sentencepiece, + require_tokenizers, + require_torch, + require_torchaudio, + slow, + torch_device, +) + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + Speech2TextConfig, + Speech2TextForConditionalGeneration, + Speech2TextModel, + Speech2TextProcessor, + ) + from transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextDecoder, Speech2TextEncoder + + +def prepare_speech_to_text_inputs_dict( + config, + input_features, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_features.ne(0) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + # note: Speech2Text models take "input_features" instead of "input_ids" + "input_features": input_features, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_torch +class Speech2TextModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, +
num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + num_conv_layers=2, + conv_kernel_sizes=(5, 5), + conv_channels=32, + input_feat_per_channel=24, + input_channels=1, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + max_source_positions=20, + max_target_positions=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.num_conv_layers = num_conv_layers + self.conv_kernel_sizes = conv_kernel_sizes + self.conv_channels = conv_channels + self.input_feat_per_channel = input_feat_per_channel + self.input_channels = input_channels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.max_source_positions = max_source_positions + self.max_target_positions = max_target_positions + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_features = floats_tensor( + [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size + ) + attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device) + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2) + + config = Speech2TextConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + num_conv_layers=self.num_conv_layers, + conv_kernel_sizes=self.conv_kernel_sizes, + conv_channels=self.conv_channels, + input_feat_per_channel=self.input_feat_per_channel, + input_channels=self.input_channels, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + max_source_positions=self.max_source_positions, + max_target_positions=self.max_target_positions, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_speech_to_text_inputs_dict( + config, + input_features=input_features, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def get_subsampled_output_lengths(self, input_lengths): + """ + Computes the output length of the convolutional layers + """ + + for i in range(self.num_conv_layers): + input_lengths = (input_lengths - 1) // 2 + 1 + + return input_lengths + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = Speech2TextModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["decoder_input_ids"] + attention_mask = inputs_dict["decoder_attention_mask"] + + # first forward 
pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = Speech2TextModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = Speech2TextEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder( + inputs_dict["input_features"], attention_mask=inputs_dict["attention_mask"] + )[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = Speech2TextDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (Speech2TextModel, Speech2TextForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (Speech2TextForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + test_torchscript = True + + input_name = "input_features" + + def setUp(self): + self.model_tester = Speech2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Speech2TextConfig) + self.maxDiff = 3000 + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname,
output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # not applicable: Speech2Text consumes audio features rather than token embeddings + def test_inputs_embeds(self): + pass + + # training is not supported yet + def test_training(self): + pass + + def test_training_gradient_checkpointing(self): + pass + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_features = input_dict["input_features"] + attention_mask = input_dict["attention_mask"] + model = Speech2TextForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + input_features = input_features.half() + model.half() + model.generate(input_features, attention_mask=attention_mask) + model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "input_features", + "attention_mask", + "decoder_input_ids", + "decoder_attention_mask", + ] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] + if all(mask_name in arg_names for mask_name in ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]) + else ["encoder_outputs"] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + else: + seq_length = self.model_tester.seq_length + + subsampled_seq_length = model._get_subsampled_output_lengths(seq_length) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [subsampled_seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True
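+ # check_hidden_states_output verifies the encoder length after conv subsampling: + # with the tester's seq_length=7 and num_conv_layers=2, each stride-2 layer maps + # L -> (L - 1) // 2 + 1 (cf. get_subsampled_output_lengths above), i.e. 7 -> 4 -> 2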
+ check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + + subsampled_encoder_seq_length = model._get_subsampled_output_lengths(encoder_seq_length) + subsampled_encoder_key_length = model._get_subsampled_output_lengths(encoder_key_length) + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], + ) + out_len = len(outputs) + + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + subsampled_encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 2 + self.assertEqual(out_len + 
added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], + ) + + def test_resize_tokens_embeddings(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # make sure that decoder_input_ids are resized + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix.
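+ # (zip truncates to the shorter tensor, so after shrinking the vocab only the + # surviving rows are compared; one differing element marks the models as unequal)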
+ models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_resize_embeddings_untied(self): + ( + original_config, + inputs_dict, + ) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + original_config.tie_word_embeddings = False + + # if the model cannot untie embeddings -> leave test + if original_config.tie_word_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + # make sure that decoder_input_ids are within the resized vocab + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + def test_generate_without_input_ids(self): + pass + + @staticmethod + def _get_encoder_outputs( + model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 + ): + encoder = model.get_encoder() + encoder_outputs = encoder( + input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( + num_interleave, dim=0 + ) + input_ids = input_ids[:, :, 0] + input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + model._get_decoder_start_token_id() + attention_mask = None + return encoder_outputs, input_ids, attention_mask + + def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + batch_size, seq_length = input_ids.shape[:2] + subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + num_sequences_in_output = batch_size * num_return_sequences + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # scores +
self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # Attentions + # encoder + self._check_encoder_attention_for_generate( + output.encoder_attentions, batch_size, config, subsampled_seq_length + ) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, subsampled_seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no NaN + configs_no_init.torchscript = True + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + + try: + model.config.use_cache = False # FSMT still requires this hack -> FSMT should probably be refactored similar to BART afterward + input_features = inputs["input_features"] + attention_mask = inputs["attention_mask"] + decoder_input_ids = inputs["decoder_input_ids"] + decoder_attention_mask = inputs["decoder_attention_mask"] + traced_model = torch.jit.trace( + model, (input_features, attention_mask, decoder_input_ids, decoder_attention_mask) + ) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + +@require_torch +@require_torchaudio +@require_sentencepiece +@require_tokenizers +@slow +class Speech2TextModelIntegrationTests(unittest.TestCase): + @cached_property + def default_processor(self): + return Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + import soundfile as sf + + # map audio files to raw waveform arrays + def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + ds = ds.select(range(num_samples)).map(map_to_array) + + return ds["speech"][:num_samples] + + def test_generation_librispeech(self): + model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") + model.to(torch_device) + processor = self.default_processor + + input_speech = self._load_datasamples(1) + +
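+ # the processor converts the raw waveform into log-mel filter-bank features; + # `input_features` below is what the Speech2Text encoder consumes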
input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device) + + generated_ids = model.generate(input_features) + generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS) + + def test_generation_librispeech_batched(self): + model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") + model.to(torch_device) + processor = self.default_processor + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_features = inputs.input_features.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + generated_ids = model.generate(input_features, attention_mask=attention_mask) + generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", + ] + + self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS) diff --git a/test_modeling_squeezebert.py b/test_modeling_squeezebert.py new file mode 100644 index 0000000000000000000000000000000000000000..8f9d65fa9ac2e1c782368f18ec3d9f8c0e80ff99 --- /dev/null +++ b/test_modeling_squeezebert.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + SqueezeBertConfig, + SqueezeBertForMaskedLM, + SqueezeBertForMultipleChoice, + SqueezeBertForQuestionAnswering, + SqueezeBertForSequenceClassification, + SqueezeBertForTokenClassification, + SqueezeBertModel, + ) + + class SqueezeBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + q_groups=2, + k_groups=2, + v_groups=2, + post_attention_groups=2, + intermediate_groups=4, + output_groups=1, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.q_groups = q_groups + self.k_groups = k_groups + self.v_groups = v_groups + self.post_attention_groups = post_attention_groups + self.intermediate_groups = intermediate_groups + self.output_groups = output_groups + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = SqueezeBertConfig( + embedding_size=self.hidden_size, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + q_groups=self.q_groups, + k_groups=self.k_groups, +
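+ # the *_groups arguments exercise SqueezeBERT's grouped pointwise convolutions + # (groups=1 would reduce a projection to an ordinary dense layer)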
v_groups=self.v_groups, + post_attention_groups=self.post_attention_groups, + intermediate_groups=self.intermediate_groups, + output_groups=self.output_groups, + ) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_squeezebert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = SqueezeBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_mask) + result = model(input_ids) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size) + ) + + def create_and_check_squeezebert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = SqueezeBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_squeezebert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = SqueezeBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_squeezebert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = SqueezeBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_squeezebert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = SqueezeBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_squeezebert_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = SqueezeBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class 
SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + SqueezeBertModel, + SqueezeBertForMaskedLM, + SqueezeBertForMultipleChoice, + SqueezeBertForQuestionAnswering, + SqueezeBertForSequenceClassification, + SqueezeBertForTokenClassification, + ) + if is_torch_available() + else () + ) + test_pruning = False + test_torchscript = True + test_resize_embeddings = True + test_head_masking = False + test_sequence_classification_problem_types = True + + def setUp(self): + self.model_tester = SqueezeBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=SqueezeBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_squeezebert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_squeezebert_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = SqueezeBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_sentencepiece +@require_tokenizers +@require_torch +class SqueezeBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_classification_head(self): + model = SqueezeBertForSequenceClassification.from_pretrained("squeezebert/squeezebert-mnli") + + input_ids = torch.tensor([[1, 29414, 232, 328, 740, 1140, 12695, 69, 13, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[0.6401, -0.0349, -0.6041]]) + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) diff --git a/test_modeling_t5.py b/test_modeling_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..c8fe6717aba85763f6ab648053be6494514c78ee --- /dev/null +++ b/test_modeling_t5.py @@ -0,0 +1,1013 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import tempfile +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ByT5Tokenizer, T5Config, T5EncoderModel, T5ForConditionalGeneration, T5Model, T5Tokenizer + from transformers.models.t5.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST + + +class T5ModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + encoder_seq_length=7, + decoder_seq_length=9, + # For common tests + is_training=True, + use_attention_mask=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + eos_token_id=1, + pad_token_id=0, + decoder_start_token_id=0, + scope=None, + decoder_layers=None, + ): + + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length + # For common tests + self.seq_length = self.decoder_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_factor = initializer_factor + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.scope = None + self.decoder_layers = decoder_layers + + def get_large_model_config(self): + return T5Config.from_pretrained("t5-base") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + attention_mask = None + decoder_attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) + + config = T5Config( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_decoder_layers=self.decoder_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, + ) + + return ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) + + def 
check_prepare_lm_labels_via_shift_left( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config) + model.to(torch_device) + model.eval() + + # make sure that lm_labels are correctly padded from the right + lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id) + + # add causal pad token mask + triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() + lm_labels.masked_fill_(triangular_mask, self.pad_token_id) + decoder_input_ids = model._shift_right(lm_labels) + + for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): + # first item + self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) + if i < decoder_input_ids_slice.shape[-1]: + if i < decoder_input_ids.shape[-1] - 1: + # items before diagonal + self.parent.assertListEqual( + decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() + ) + # pad items after diagonal + if i < decoder_input_ids.shape[-1] - 2: + self.parent.assertListEqual( + decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() + ) + else: + # all items after square + self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) + + def create_and_check_model( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + decoder_output = result.last_hidden_state + decoder_past = result.past_key_values + encoder_output = result.encoder_last_hidden_state + + self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size)) + self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size)) + # There should be `num_layers` key value embeddings stored in decoder_past + self.parent.assertEqual(len(decoder_past), config.num_layers) + # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple + self.parent.assertEqual(len(decoder_past[0]), 4) + + def create_and_check_with_lm_head( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5ForConditionalGeneration(config=config).to(torch_device).eval() + outputs = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + self.parent.assertEqual(len(outputs), 4) + self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size)) + self.parent.assertEqual(outputs["loss"].size(), ()) + + def create_and_check_decoder_model_past( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config).get_decoder().to(torch_device).eval() + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) ==
len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + + output_from_no_past = model(next_input_ids)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_decoder_model_attention_mask_past( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config).get_decoder() + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + + half_seq_length = input_ids.shape[-1] // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple() + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config).get_decoder().to(torch_device).eval() + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and attention mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask
= torch.cat([attention_mask, next_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_generate_with_past_key_values( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5ForConditionalGeneration(config=config).to(torch_device).eval() + torch.manual_seed(0) + output_without_past_cache = model.generate( + input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False + ) + torch.manual_seed(0) + output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) + self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache)) + + def create_and_check_model_fp16_forward( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = T5Model(config=config).to(torch_device).half().eval() + output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] + self.parent.assertFalse(torch.isnan(output).any().item()) + + def create_and_check_encoder_decoder_shared_weights( + self, + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + for model_class in [T5Model, T5ForConditionalGeneration]: + torch.manual_seed(0) + model = model_class(config=config).to(torch_device).eval() + # load state dict copies weights but does not tie them + model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) + + torch.manual_seed(0) + tied_config = copy.deepcopy(config) + tied_config.tie_encoder_decoder = True + tied_model = model_class(config=tied_config).to(torch_device).eval() + + model_result = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that the tied model has fewer parameters + self.parent.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + tied_model.save_pretrained(tmpdirname) + tied_model = model_class.from_pretrained(tmpdirname) + tied_model.to(torch_device) + tied_model.eval() + + # check that the tied model has fewer parameters + self.parent.assertLess( + sum(p.numel() for p in
tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that outputs are equal + self.parent.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], + tied_model_result[0][0, :, random_slice_idx], + atol=1e-4, + ) + ) + + def check_resize_embeddings_t5_v1_1( + self, + config, + ): + prev_vocab_size = config.vocab_size + + config.tie_word_embeddings = False + model = T5ForConditionalGeneration(config=config).to(torch_device).eval() + model.resize_token_embeddings(prev_vocab_size - 10) + + self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) + self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) + self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "use_cache": False, + } + return config, inputs_dict + + +@require_torch +class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else () + fx_ready_model_classes = all_model_classes + all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () + test_pruning = False + test_torchscript = True + test_resize_embeddings = True + test_model_parallel = True + is_encoder_decoder = True + + def setUp(self): + self.model_tester = T5ModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_shift_right(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_v1_1(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + # check that gated gelu feed forward and different word embeddings work + config = config_and_inputs[0] + config.tie_word_embeddings = False + config.feed_forward_proj = "gated-gelu" + self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) + + def test_with_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_with_lm_head(*config_and_inputs) + + def test_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) + + def test_decoder_model_past_with_attn_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
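# The checker below follows the cache-equivalence pattern used throughout this tester: one forward pass is run with use_cache=True, then only the new token is fed together with the returned past_key_values, and a random slice of the hidden states must match a full no-cache forward pass within atol=1e-3. +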
self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + + def test_decoder_model_past_with_3d_attn_mask(self): + ( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = self.model_tester.prepare_config_and_inputs() + + attention_mask = ids_tensor( + [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], + vocab_size=2, + ) + decoder_attention_mask = ids_tensor( + [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], + vocab_size=2, + ) + + self.model_tester.create_and_check_decoder_model_attention_mask_past( + config, + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_generate_with_past_key_values(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs) + + def test_encoder_decoder_shared_weights(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_model_fp16_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) + + def test_v1_1_resize_embeddings(self): + config = self.model_tester.prepare_config_and_inputs()[0] + self.model_tester.check_resize_embeddings_t5_v1_1(config) + + @slow + def test_model_from_pretrained(self): + for model_name in T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = T5Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip("Test has a segmentation fault on torch 1.8.0") + def test_export_to_onnx(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + model = T5Model(config_and_inputs[0]).to(torch_device) + with tempfile.TemporaryDirectory() as tmpdirname: + torch.onnx.export( + model, + (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), + f"{tmpdirname}/t5_test.onnx", + export_params=True, + opset_version=9, + input_names=["input_ids", "decoder_input_ids"], + ) + + def test_generate_with_head_masking(self): + attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] + config_and_inputs = self.model_tester.prepare_config_and_inputs() + config = config_and_inputs[0] + max_length = config_and_inputs[1].shape[-1] + 3 + model = T5ForConditionalGeneration(config).eval() + model.to(torch_device) + + head_masking = { + "head_mask": torch.zeros(config.num_layers, config.num_heads, device=torch_device), + "decoder_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device), + "cross_attn_head_mask": torch.zeros(config.num_decoder_layers, config.num_heads, device=torch_device), + } + + for attn_name, (name, mask) in zip(attention_names, head_masking.items()): + head_masks = {name: mask} + # Explicitly pass decoder_head_mask, as the T5 model requires it whenever head_mask is specified + if name == "head_mask": + head_masks["decoder_head_mask"] = torch.ones( + config.num_decoder_layers, config.num_heads,
device=torch_device + ) + + out = model.generate( + config_and_inputs[1], + num_beams=1, + max_length=max_length, + output_attentions=True, + return_dict_in_generate=True, + **head_masks, + ) + # We check the state of decoder_attentions and cross_attentions just from the last step + attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] + self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) + + +class T5EncoderOnlyModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + encoder_seq_length=7, + # For common tests + use_attention_mask=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + is_training=False, + dropout_rate=0.1, + initializer_factor=0.002, + is_encoder_decoder=False, + eos_token_id=1, + pad_token_id=0, + scope=None, + ): + + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + # For common tests + self.seq_length = self.encoder_seq_length + self.use_attention_mask = use_attention_mask + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_factor = initializer_factor + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.is_encoder_decoder = is_encoder_decoder + self.scope = None + self.is_training = is_training + + def get_large_model_config(self): + return T5Config.from_pretrained("t5-base") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + config = T5Config( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + ) + + def create_and_check_model( + self, + config, + input_ids, + attention_mask, + ): + model = T5EncoderModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + attention_mask=attention_mask, + ) + result = model(input_ids=input_ids) + encoder_output = result.last_hidden_state + + self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size)) + + def create_and_check_model_fp16_forward( + self, + config, + input_ids, + attention_mask, + ): + model = T5EncoderModel(config=config).to(torch_device).half().eval() + output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] + self.parent.assertFalse(torch.isnan(output).any().item()) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + 
return config, inputs_dict + + + @require_torch + class T5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (T5EncoderModel,) if is_torch_available() else () + test_pruning = False + test_torchscript = True + test_resize_embeddings = False + test_model_parallel = True + all_parallelizable_model_classes = (T5EncoderModel,) if is_torch_available() else () + + def setUp(self): + self.model_tester = T5EncoderOnlyModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skipIf(torch_device == "cpu", "Can't do half precision") + def test_model_fp16_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) + + + def use_task_specific_params(model, task): + model.config.update(model.config.task_specific_params[task]) + + + @require_torch + @require_sentencepiece + @require_tokenizers + class T5ModelIntegrationTests(unittest.TestCase): + @cached_property + def model(self): + return T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device) + + @cached_property + def tokenizer(self): + return T5Tokenizer.from_pretrained("t5-base") + + @slow + def test_small_integration_test(self): + """ + For comparison, run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_t5_checkpoint = '' + >>> path_to_mtf_small_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = T5ForConditionalGeneration.from_pretrained("t5-small").to(torch_device) + tokenizer = T5Tokenizer.from_pretrained("t5-small") + + input_ids = tokenizer("Hello there", return_tensors="pt").input_ids + labels = tokenizer("Hi I am", return_tensors="pt").input_ids + + loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss + mtf_score = -(labels.shape[-1] * loss.item()) + + EXPECTED_SCORE = -19.0845 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_small_v1_1_integration_test(self): + """ + For comparison, run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_t5_v1_1_checkpoint = '' + >>> path_to_mtf_small_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1_1_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small").to(torch_device) + tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small") + + input_ids = tokenizer("Hello there", return_tensors="pt").input_ids + labels = tokenizer("Hi I am", return_tensors="pt").input_ids + + loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss + mtf_score = -(labels.shape[-1] * loss.item()) + + EXPECTED_SCORE = -59.0293 +
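# A note on the score arithmetic used in these integration tests: `loss` is the mean cross-entropy per target token, so multiplying by the number of target tokens and negating recovers the summed log-likelihood that MTF's `score()` reports, i.e. mtf_score = -(labels.shape[-1] * loss.item()). +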
self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_small_byt5_integration_test(self): + """ + For comparison, run: + >>> import t5 # pip install t5==0.9.1 + + >>> path_to_byt5_small_checkpoint = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_byt5_small_checkpoint, batch_size=1, tpu=None) + >>> vocab = t5.data.ByteVocabulary() + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = T5ForConditionalGeneration.from_pretrained("google/byt5-small").to(torch_device) + tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small") + + input_ids = tokenizer("Hello there", return_tensors="pt").input_ids + labels = tokenizer("Hi I am", return_tensors="pt").input_ids + + loss = model(input_ids.to(torch_device), labels=labels.to(torch_device)).loss + mtf_score = -(labels.shape[-1] * loss.item()) + + EXPECTED_SCORE = -60.7397 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_summarization(self): + model = self.model + tok = self.tokenizer + + FRANCE_ARTICLE = 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. 
He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. 
Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noqa + SHORTER_ARTICLE = '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. 
"What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + IRAN_ARTICLE = "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. 
Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + ARTICLE_SUBWAY = 'New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. 
In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + + expected_summaries = [ + 'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a cell phone video of the final seconds . "one can hear cries of \'My God\' in several languages," one magazine says .', + "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a preliminary examination into the situation in the occupied Palestinian territory . as members of the court, Palestinians may be subject to counter-charges as well .", + "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller: the debate that has already begun since the announcement of the new framework will likely result in more heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and implement a rigorous inspection regime .", + 'prosecutors say the marriages were part of an immigration scam . 
if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, with nine of her marriages occurring between 1999 and 2002 .', + ] + + use_task_specific_params(model, "summarization") + + dct = tok( + [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], + padding="max_length", + truncation=True, + return_tensors="pt", + ).to(torch_device) + self.assertEqual(512, dct["input_ids"].shape[1]) + + hypotheses_batch = model.generate( + **dct, + num_beams=4, + length_penalty=2.0, + max_length=142, + min_length=56, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + + decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertListEqual( + expected_summaries, + decoded, + ) + + @slow + def test_translation_en_to_de(self): + model = self.model + tok = self.tokenizer + use_task_specific_params(model, "translation_en_to_de") + + en_text = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.' + expected_translation = ( + '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.' + ) + + input_ids = tok.encode(model.config.prefix + en_text, return_tensors="pt") + input_ids = input_ids.to(torch_device) + output = model.generate(input_ids) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_fr(self): + model = self.model # t5-base + tok = self.tokenizer + use_task_specific_params(model, "translation_en_to_fr") + + en_text = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. ' + + input_ids = tok.encode(model.config.prefix + en_text, return_tensors="pt") + input_ids = input_ids.to(torch_device) + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=100, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + new_truncated_translation = ( + "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre " + "un " + "« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées " + "sous forme " + "de points bleus." + ) + + self.assertEqual(translation, new_truncated_translation) + + @slow + def test_translation_en_to_ro(self): + model = self.model + tok = self.tokenizer + use_task_specific_params(model, "translation_en_to_ro") + en_text = "Taco Bell said it plans to add 2,000 locations in the US by 2022." + expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022." 
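+ # use_task_specific_params copies the "translation_en_to_ro" entry of model.config.task_specific_params onto the top-level config; for t5-base this is expected to include a prefix along the lines of "translate English to Romanian: " (the exact string lives in the hub config), which is why model.config.prefix is prepended below.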
+ + inputs = tok(model.config.prefix + en_text, return_tensors="pt").to(torch_device) + output = model.generate(**inputs) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(translation, expected_translation) + + +@require_torch +class TestAsymmetricT5(unittest.TestCase): + def build_model_and_check_forward_pass(self, **kwargs): + tester = T5ModelTester(self, **kwargs) + config, *inputs = tester.prepare_config_and_inputs() + ( + input_ids, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ) = inputs + model = T5ForConditionalGeneration(config=config).to(torch_device).eval() + outputs = model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + labels=lm_labels, + ) + # outputs = model(*inputs) + assert len(outputs) == 4 + assert outputs["logits"].size() == (tester.batch_size, tester.decoder_seq_length, tester.vocab_size) + assert outputs["loss"].size() == () + return model + + def test_small_decoder(self): + # num_hidden_layers is passed to T5Config as num_layers + model = self.build_model_and_check_forward_pass(decoder_layers=1, num_hidden_layers=2) + assert len(model.encoder.block) == 2 + assert len(model.decoder.block) == 1 + + def test_defaulting_to_symmetry(self): + # num_hidden_layers is passed to T5Config as num_layers + model = self.build_model_and_check_forward_pass(num_hidden_layers=2) + assert len(model.decoder.block) == len(model.encoder.block) == 2 diff --git a/test_modeling_tapas.py b/test_modeling_tapas.py new file mode 100644 index 0000000000000000000000000000000000000000..40bdba0e7079afd64b11b3d3a0359c93590514c4 --- /dev/null +++ b/test_modeling_tapas.py @@ -0,0 +1,1082 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
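+ # Note: TAPAS additionally depends on the torch-scatter package, so the model tests below are gated with @require_scatter on top of @require_torch.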
+ + +import copy +import unittest + +import numpy as np +import pandas as pd + +from transformers import ( + MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_MASKED_LM_MAPPING, + MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + is_torch_available, +) +from transformers.file_utils import cached_property +from transformers.models.auto import get_values +from transformers.testing_utils import require_scatter, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + TapasConfig, + TapasForMaskedLM, + TapasForQuestionAnswering, + TapasForSequenceClassification, + TapasModel, + TapasTokenizer, + ) + from transformers.models.tapas.modeling_tapas import ( + IndexMap, + ProductIndexMap, + flatten, + gather, + range_index_map, + reduce_max, + reduce_mean, + reduce_sum, + ) + + +class TapasModelTester: + """You can also import this e.g from .test_modeling_tapas import TapasModelTester""" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + max_position_embeddings=512, + type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10], + type_sequence_label_size=2, + positive_weight=10.0, + num_aggregation_labels=4, + num_labels=2, + aggregation_loss_importance=0.8, + use_answer_as_supervision=True, + answer_loss_importance=0.001, + use_normalized_answer_loss=False, + huber_loss_delta=25.0, + temperature=1.0, + agg_temperature=1.0, + use_gumbel_for_cells=False, + use_gumbel_for_agg=False, + average_approximation_function="ratio", + cell_selection_preference=0.5, + answer_loss_cutoff=100, + max_num_rows=64, + max_num_columns=32, + average_logits_per_cell=True, + select_one_column=True, + allow_empty_column_selection=False, + init_cell_selection_weights_to_zero=False, + reset_position_index_per_cell=True, + disable_per_token_loss=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.max_position_embeddings = max_position_embeddings + self.type_vocab_sizes = type_vocab_sizes + self.type_sequence_label_size = type_sequence_label_size + self.positive_weight = positive_weight + self.num_aggregation_labels = num_aggregation_labels + self.num_labels = num_labels + self.aggregation_loss_importance = aggregation_loss_importance + self.use_answer_as_supervision = use_answer_as_supervision + 
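# The attributes below mirror TapasConfig's aggregation/answer-loss options for the weakly supervised (WTQ-style) setting; prepare_config_and_inputs passes them through to the config unchanged. +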
self.answer_loss_importance = answer_loss_importance + self.use_normalized_answer_loss = use_normalized_answer_loss + self.huber_loss_delta = huber_loss_delta + self.temperature = temperature + self.agg_temperature = agg_temperature + self.use_gumbel_for_cells = use_gumbel_for_cells + self.use_gumbel_for_agg = use_gumbel_for_agg + self.average_approximation_function = average_approximation_function + self.cell_selection_preference = cell_selection_preference + self.answer_loss_cutoff = answer_loss_cutoff + self.max_num_rows = max_num_rows + self.max_num_columns = max_num_columns + self.average_logits_per_cell = average_logits_per_cell + self.select_one_column = select_one_column + self.allow_empty_column_selection = allow_empty_column_selection + self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero + self.reset_position_index_per_cell = reset_position_index_per_cell + self.disable_per_token_loss = disable_per_token_loss + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(torch_device) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]).to(torch_device) + + token_type_ids = [] + for type_vocab_size in self.type_vocab_sizes: + token_type_ids.append(ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=type_vocab_size)) + token_type_ids = torch.stack(token_type_ids, dim=2).to(torch_device) + + sequence_labels = None + token_labels = None + labels = None + numeric_values = None + numeric_values_scale = None + float_answer = None + aggregation_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(torch_device) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(torch_device) + labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(torch_device) + numeric_values = floats_tensor([self.batch_size, self.seq_length]).to(torch_device) + numeric_values_scale = floats_tensor([self.batch_size, self.seq_length]).to(torch_device) + float_answer = floats_tensor([self.batch_size]).to(torch_device) + aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels).to(torch_device) + + config = TapasConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_sizes=self.type_vocab_sizes, + initializer_range=self.initializer_range, + positive_weight=self.positive_weight, + num_aggregation_labels=self.num_aggregation_labels, + num_labels=self.num_labels, + aggregation_loss_importance=self.aggregation_loss_importance, + use_answer_as_supervision=self.use_answer_as_supervision, + answer_loss_importance=self.answer_loss_importance, + use_normalized_answer_loss=self.use_normalized_answer_loss, + huber_loss_delta=self.huber_loss_delta, + temperature=self.temperature, + agg_temperature=self.agg_temperature, + use_gumbel_for_cells=self.use_gumbel_for_cells, + use_gumbel_for_agg=self.use_gumbel_for_agg, + average_approximation_function=self.average_approximation_function, + cell_selection_preference=self.cell_selection_preference, + 
answer_loss_cutoff=self.answer_loss_cutoff, + max_num_rows=self.max_num_rows, + max_num_columns=self.max_num_columns, + average_logits_per_cell=self.average_logits_per_cell, + select_one_column=self.select_one_column, + allow_empty_column_selection=self.allow_empty_column_selection, + init_cell_selection_weights_to_zero=self.init_cell_selection_weights_to_zero, + reset_position_index_per_cell=self.reset_position_index_per_cell, + disable_per_token_loss=self.disable_per_token_loss, + ) + + return ( + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ) + + def create_and_check_model( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + model = TapasModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + model = TapasForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + # inference: without aggregation head (SQA). Model only returns logits + sqa_config = copy.copy(config) + sqa_config.num_aggregation_labels = 0 + sqa_config.use_answer_as_supervision = False + model = TapasForQuestionAnswering(config=sqa_config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + # inference: with aggregation head (WTQ, WikiSQL-supervised). 
Model returns logits and aggregation logits + model = TapasForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + # training: can happen in 3 main ways + # case 1: conversational (SQA) + model = TapasForQuestionAnswering(config=sqa_config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=labels, + ) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + # case 2: weak supervision for aggregation (WTQ) + model = TapasForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids=input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=labels, + numeric_values=numeric_values, + numeric_values_scale=numeric_values_scale, + float_answer=float_answer, + ) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + # case 3: strong supervision for aggregation (WikiSQL-supervised) + wikisql_config = copy.copy(config) + wikisql_config.use_answer_as_supervision = False + model = TapasForQuestionAnswering(config=wikisql_config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=labels, + aggregation_labels=aggregation_labels, + ) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + config.num_labels = self.num_labels + model = TapasForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +@require_scatter +class TapasModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TapasModel, + TapasForMaskedLM, + TapasForQuestionAnswering, + TapasForSequenceClassification, + ) + if is_torch_available() + else None + ) + test_pruning = False + test_torchscript = False + test_resize_embeddings = True + test_head_masking = False + + def _prepare_for_class(self, inputs_dict, model_class, 
return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() + if isinstance(v, torch.Tensor) and v.ndim > 1 + else v + for k, v in inputs_dict.items() + } + + if return_labels: + if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) + elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["aggregation_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["numeric_values"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), + dtype=torch.float, + device=torch_device, + ) + inputs_dict["numeric_values_scale"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), + dtype=torch.float, + device=torch_device, + ) + inputs_dict["float_answer"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.float, device=torch_device + ) + elif model_class in [ + *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class in [ + *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(MODEL_FOR_MASKED_LM_MAPPING), + *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = TapasModelTester(self) + self.config_tester = ConfigTester(self, config_class=TapasConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + +def prepare_tapas_single_inputs_for_inference(): + # Here we prepare a single table-question pair to test TAPAS inference on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + } + queries = "Which footballer is 33 years old?" 
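+ # For reference, pd.DataFrame.from_dict(data) below yields a two-row table with columns Footballer ("Lionel Messi", "Cristiano Ronaldo") and Age ("33", "35").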
+ table = pd.DataFrame.from_dict(data) + + return table, queries + + +def prepare_tapas_batch_inputs_for_inference(): + # Here we prepare a batch of 2 table-question pairs to test TAPAS inference on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + "Number of goals": ["712", "750"], + } + queries = ["Which footballer is 33 years old?", "How many goals does Ronaldo have?"] + table = pd.DataFrame.from_dict(data) + + return table, queries + + +def prepare_tapas_batch_inputs_for_training(): + # Here we prepare a DIFFERENT batch of 2 table-question pairs to test TAPAS training on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + "Number of goals": ["712", "750"], + } + queries = ["Which footballer is 33 years old?", "What's the total number of goals?"] + table = pd.DataFrame.from_dict(data) + + answer_coordinates = [[(0, 0)], [(0, 2), (1, 2)]] + answer_text = [["Lionel Messi"], ["1462"]] + float_answer = [float("NaN"), float("1462")] + + return table, queries, answer_coordinates, answer_text, float_answer + + +@require_torch +@require_scatter +class TapasModelIntegrationTest(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq") + + @slow + def test_inference_no_head(self): + # ideally we want to test this with the weights of tapas_inter_masklm_base_reset, + # but since it's not straightforward to do this with the TF 1 implementation, we test it with + # the weights of the WTQ base model (i.e. tapas_wtq_wikisql_sqa_inter_masklm_base_reset) + model = TapasModel.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device) + + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="pt") + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + outputs = model(**inputs) + # test the sequence output + expected_slice = torch.tensor( + [ + [ + [-0.141581565, -0.599805772, 0.747186482], + [-0.143664181, -0.602008104, 0.749218345], + [-0.15169853, -0.603363097, 0.741370678], + ] + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(outputs.last_hidden_state[:, :3, :3], expected_slice, atol=0.0005)) + + # test the pooled output + expected_slice = torch.tensor([[0.987518311, -0.970520139, -0.994303405]], device=torch_device) + + self.assertTrue(torch.allclose(outputs.pooler_output[:, :3], expected_slice, atol=0.0005)) + + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + # TapasForQuestionAnswering has 3 possible ways of being fine-tuned: + # - conversational set-up (SQA) + # - weak supervision for aggregation (WTQ, WikiSQL) + # - strong supervision for aggregation (WikiSQL-supervised) + # We test all of them: + @slow + def test_inference_question_answering_head_conversational(self): + # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset + model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa").to(torch_device) + + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="pt") + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + outputs = model(**inputs) + # test the logits + logits = outputs.logits + expected_shape = torch.Size((1, 21)) + 
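# The 21 positions cover [CLS], the question word-pieces, [SEP], and the flattened column headers and cell values (TAPAS linearizes the table row by row after the question), so the logits assign one cell-selection score per token position. +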
self.assertEqual(logits.shape, expected_shape) + + expected_tensor = torch.tensor( + [ + [ + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -9997.22461, + -16.2628059, + -10004.082, + 15.4330549, + 15.4330549, + 15.4330549, + -9990.42, + -16.3270779, + -16.3270779, + -16.3270779, + -16.3270779, + -16.3270779, + -10004.8506, + ] + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits, expected_tensor, atol=0.015)) + + @slow + def test_inference_question_answering_head_conversational_absolute_embeddings(self): + # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset + # however here we test the version with absolute position embeddings + model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset").to( + torch_device + ) + + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="pt") + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + outputs = model(**inputs) + # test the logits + logits = outputs.logits + expected_shape = torch.Size((1, 21)) + self.assertEqual(logits.shape, expected_shape) + + expected_tensor = torch.tensor( + [ + [ + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -10014.7793, + -18.8419304, + -10018.0391, + 17.7848816, + 17.7848816, + 17.7848816, + -9981.02832, + -16.4005489, + -16.4005489, + -16.4005489, + -16.4005489, + -16.4005489, + -10013.4736, + ] + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits, expected_tensor, atol=0.01)) + + @slow + def test_inference_question_answering_head_weak_supervision(self): + # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset + model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device) + + tokenizer = self.default_tokenizer + # let's test on a batch + table, queries = prepare_tapas_batch_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt") + inputs_on_device = {k: v.to(torch_device) for k, v in inputs.items()} + + outputs = model(**inputs_on_device) + # test the logits + logits = outputs.logits + expected_shape = torch.Size((2, 28)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736], + [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677], + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits[:, -6:], expected_slice, atol=0.4)) + + # test the aggregation logits + logits_aggregation = outputs.logits_aggregation + expected_shape = torch.Size((2, 4)) + self.assertEqual(logits_aggregation.shape, expected_shape) + expected_tensor = torch.tensor( + [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=0.001)) + + # test the predicted answer coordinates and aggregation indices + EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]] + EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1] + + predicted_answer_coordinates, predicted_aggregation_indices = 
tokenizer.convert_logits_to_predictions( + inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu() + ) + + self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates) + self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices) + + @slow + def test_training_question_answering_head_weak_supervision(self): + # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset + model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device) + # normally we would put the model in training mode, but it's not straightforward to do this with the TF 1 implementation + + tokenizer = self.default_tokenizer + # let's test on a batch + table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training() + inputs = tokenizer( + table=table, + queries=queries, + answer_coordinates=answer_coordinates, + answer_text=answer_text, + padding="longest", + return_tensors="pt", + )
+ + # prepare data (created by the tokenizer) and move to torch_device + input_ids = inputs["input_ids"].to(torch_device) + attention_mask = inputs["attention_mask"].to(torch_device) + token_type_ids = inputs["token_type_ids"].to(torch_device) + labels = inputs["labels"].to(torch_device) + numeric_values = inputs["numeric_values"].to(torch_device) + numeric_values_scale = inputs["numeric_values_scale"].to(torch_device) + + # the float answer must be prepared by the user, as it is not created by the tokenizer + float_answer = torch.FloatTensor(float_answer).to(torch_device) + + # forward pass to get loss + logits: + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels, + numeric_values=numeric_values, + numeric_values_scale=numeric_values_scale, + float_answer=float_answer, + )
+ + # test the loss + loss = outputs.loss + expected_loss = torch.tensor(3.3527612686157227e-08, device=torch_device) + self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-6)) + + # test the logits on the first example + logits = outputs.logits + expected_shape = torch.Size((2, 29)) + self.assertEqual(logits.shape, expected_shape) + expected_slice = torch.tensor( + [ + -160.0156, + -160.0156, + -160.0156, + -160.0156, + -160.0156, + -10072.2266, + -10070.8896, + -10092.6006, + -10092.6006, + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits[0, -9:], expected_slice, atol=1e-6))
+ + # test the aggregation logits on the second example + logits_aggregation = outputs.logits_aggregation + expected_shape = torch.Size((2, 4)) + self.assertEqual(logits_aggregation.shape, expected_shape) + expected_slice = torch.tensor([-4.0538, 40.0304, -5.3554, 23.3965], device=torch_device) + + self.assertTrue(torch.allclose(logits_aggregation[1, -4:], expected_slice, atol=1e-4)) + + @slow + def test_inference_question_answering_head_strong_supervision(self): + # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset + model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised").to( + torch_device + ) + + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="pt") + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + outputs = model(**inputs) + # test the logits + logits = outputs.logits + expected_shape
= torch.Size((1, 21)) + self.assertEqual(logits.shape, expected_shape) + expected_tensor = torch.tensor( + [ + [ + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -18.6185989, + -10008.7969, + 17.6355762, + 17.6355762, + 17.6355762, + -10002.4404, + -18.7111301, + -18.7111301, + -18.7111301, + -18.7111301, + -18.7111301, + -10007.0977, + ] + ], + device=torch_device, + ) + + self.assertTrue(torch.allclose(logits, expected_tensor, atol=0.02)) + + # test the aggregation logits + logits_aggregation = outputs.logits_aggregation + expected_shape = torch.Size((1, 4)) + self.assertEqual(logits_aggregation.shape, expected_shape) + expected_tensor = torch.tensor( + [[16.5659733, -3.06624889, -2.34152961, -0.970244825]], device=torch_device + ) # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]] + + self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=0.003)) + + @slow + def test_inference_classification_head(self): + # note that google/tapas-base-finetuned-tabfact should correspond to tapas_tabfact_inter_masklm_base_reset + model = TapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact").to(torch_device) + + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt") + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + outputs = model(**inputs) + + # test the classification logits + logits = outputs.logits + expected_shape = torch.Size((1, 2)) + self.assertEqual(logits.shape, expected_shape) + expected_tensor = torch.tensor( + [[0.795137286, 9.5572]], device=torch_device + ) # Note that the PyTorch model outputs [[0.8057, 9.5281]] + + self.assertTrue(torch.allclose(outputs.logits, expected_tensor, atol=0.05)) + + +# Below: tests for Tapas utilities which are defined in modeling_tapas.py. +# These are based on segmented_tensor_test.py of the original implementation. +# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py +@require_scatter +class TapasUtilitiesTest(unittest.TestCase): + def _prepare_tables(self): + """Prepares two tables, both with three distinct rows. + The first table has two columns: + 1.0, 2.0 | 3.0 + 2.0, 0.0 | 1.0 + 1.0, 3.0 | 4.0 + The second table has three columns: + 1.0 | 2.0 | 3.0 + 2.0 | 0.0 | 1.0 + 1.0 | 3.0 | 4.0 + Returns: + SegmentedTensors with the tables. + """ + values = torch.tensor( + [ + [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], + [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], + ] + ) + row_index = IndexMap( + indices=torch.tensor( + [ + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + ] + ), + num_segments=3, + batch_dims=1, + ) + col_index = IndexMap( + indices=torch.tensor( + [ + [[0, 0, 1], [0, 0, 1], [0, 0, 1]], + [[0, 1, 2], [0, 1, 2], [0, 1, 2]], + ] + ), + num_segments=3, + batch_dims=1, + ) + return values, row_index, col_index + + def test_product_index(self): + _, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_index_proj = cell_index.project_outer(cell_index) + col_index_proj = cell_index.project_inner(cell_index) + + ind = cell_index.indices + self.assertEqual(cell_index.num_segments, 9) + + # Projections should give back the original indices. 
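+ # (project_outer recovers the row map, project_inner recovers the column map)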
+ # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(row_index.indices.numpy(), row_index_proj.indices.numpy()) + self.assertEqual(row_index.num_segments, row_index_proj.num_segments) + self.assertEqual(row_index.batch_dims, row_index_proj.batch_dims) + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(col_index.indices.numpy(), col_index_proj.indices.numpy()) + self.assertEqual(col_index.batch_dims, col_index_proj.batch_dims) + + # The first and second "column" are identified in the first table. + for i in range(3): + self.assertEqual(ind[0, i, 0], ind[0, i, 1]) + self.assertNotEqual(ind[0, i, 0], ind[0, i, 2])
+ + # All rows are distinct in the first table. + for i in range(3): + for i_2 in range(3): + for j in range(3): + for j_2 in range(3): + if i != i_2 and j != j_2: + self.assertNotEqual(ind[0, i, j], ind[0, i_2, j_2]) + + # All cells are distinct in the second table. + for i in range(3): + for i_2 in range(3): + for j in range(3): + for j_2 in range(3): + if i != i_2 or j != j_2: + self.assertNotEqual(ind[1, i, j], ind[1, i_2, j_2])
+ + def test_flatten(self): + _, row_index, col_index = self._prepare_tables() + row_index_flat = flatten(row_index) + col_index_flat = flatten(col_index) + + shape = [3, 4, 5] + batched_index = IndexMap(indices=torch.zeros(shape).type(torch.LongTensor), num_segments=1, batch_dims=3) + batched_index_flat = flatten(batched_index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal( + row_index_flat.indices.numpy(), [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5] + ) + np.testing.assert_array_equal( + col_index_flat.indices.numpy(), [0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 4, 5, 3, 4, 5, 3, 4, 5] + ) + self.assertEqual(batched_index_flat.num_segments.numpy(), np.prod(shape)) + np.testing.assert_array_equal(batched_index_flat.indices.numpy(), range(np.prod(shape)))
+ + def test_range_index_map(self): + batch_shape = [3, 4] + num_segments = 5 + index = range_index_map(batch_shape, num_segments) + + self.assertEqual(num_segments, index.num_segments) + self.assertEqual(2, index.batch_dims) + indices = index.indices + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(list(indices.size()), [3, 4, 5]) + for i in range(batch_shape[0]): + for j in range(batch_shape[1]): + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(indices[i, j, :].numpy(), range(num_segments))
+ + def test_reduce_sum(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_sum, _ = reduce_sum(values, row_index) + col_sum, _ = reduce_sum(values, col_index) + cell_sum, _ = reduce_sum(values, cell_index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose(row_sum.numpy(), [[6.0, 3.0, 8.0], [6.0, 3.0, 8.0]]) + np.testing.assert_allclose(col_sum.numpy(), [[9.0, 8.0, 0.0], [4.0, 5.0, 8.0]]) + np.testing.assert_allclose( + cell_sum.numpy(), + [[3.0, 3.0, 0.0, 2.0, 1.0, 0.0, 4.0, 4.0, 0.0], [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0]], + ) + + def test_reduce_mean(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_mean, _ = reduce_mean(values, row_index) + col_mean, _ = reduce_mean(values, col_index) + cell_mean, _ =
reduce_mean(values, cell_index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose( + row_mean.numpy(), [[6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0], [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0]] + ) + np.testing.assert_allclose(col_mean.numpy(), [[9.0 / 6.0, 8.0 / 3.0, 0.0], [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0]]) + np.testing.assert_allclose( + cell_mean.numpy(), + [ + [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0], + [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0], + ], + )
+ + def test_reduce_max(self): + values = torch.as_tensor([2.0, 1.0, 0.0, 3.0]) + index = IndexMap(indices=torch.as_tensor([0, 1, 0, 1]), num_segments=2) + maximum, _ = reduce_max(values, index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(maximum.numpy(), [2, 3]) + + def test_reduce_sum_vectorized(self): + values = torch.as_tensor([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]) + index = IndexMap(indices=torch.as_tensor([0, 0, 1]), num_segments=2, batch_dims=0) + sums, new_index = reduce_sum(values, index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose(sums.numpy(), [[3.0, 5.0, 7.0], [3.0, 4.0, 5.0]]) + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(new_index.indices.numpy(), [0, 1]) + np.testing.assert_array_equal(new_index.num_segments.numpy(), 2) + np.testing.assert_array_equal(new_index.batch_dims, 0)
+ + def test_gather(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + + # Compute sums and then gather. The result should have the same shape as + # the original table and each element should contain the sum of the values in + # its cell. + sums, _ = reduce_sum(values, cell_index) + cell_sum = gather(sums, cell_index) + assert cell_sum.size() == values.size() + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose( + cell_sum.numpy(), + [[[3.0, 3.0, 3.0], [2.0, 2.0, 1.0], [4.0, 4.0, 4.0]], [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]]], + ) + + def test_gather_vectorized(self): + values = torch.as_tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + index = IndexMap(indices=torch.as_tensor([[0, 1], [1, 0]]), num_segments=2, batch_dims=1) + result = gather(values, index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(result.numpy(), [[[1, 2], [3, 4]], [[7, 8], [5, 6]]]) diff --git a/test_modeling_tf_albert.py b/test_modeling_tf_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6b32ab849599dec13c1be4cbb99e3654bf6773 --- /dev/null +++ b/test_modeling_tf_albert.py @@ -0,0 +1,337 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import unittest + +from transformers import AlbertConfig, is_tf_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TF_MODEL_FOR_PRETRAINING_MAPPING + from transformers.models.albert.modeling_tf_albert import ( + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFAlbertForMaskedLM, + TFAlbertForMultipleChoice, + TFAlbertForPreTraining, + TFAlbertForQuestionAnswering, + TFAlbertForSequenceClassification, + TFAlbertForTokenClassification, + TFAlbertModel, + )
+ + +class TFAlbertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=16, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope
+ + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = AlbertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + embedding_size=self.embedding_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_albert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFAlbertModel(config=config) + # inputs = {'input_ids': input_ids, + # 'attention_mask': input_mask,
+ # 'token_type_ids': token_type_ids} + # sequence_output, pooled_output = model(**inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_albert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFAlbertForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_albert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFAlbertForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_albert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFAlbertForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_albert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFAlbertForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_albert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFAlbertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) + + def create_and_check_albert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = 
TFAlbertForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFAlbertModel, + TFAlbertForPreTraining, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TFAlbertForQuestionAnswering, + TFAlbertForTokenClassification, + TFAlbertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + + return inputs_dict + + def setUp(self): + self.model_tester = TFAlbertModelTester(self) + self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_albert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_for_multiple_choice(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFAlbertForPreTraining, TFAlbertForMaskedLM] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + @slow 
+ def test_model_from_pretrained(self): + for model_name in TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFAlbertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFAlbertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFAlbertForPreTraining.from_pretrained("albert-base-v2") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 30000] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [4.595668, 0.74462754, -1.818147], + [4.5954347, 0.7454184, -1.8188258], + [4.5954905, 0.7448235, -1.8182316], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_auto.py b/test_modeling_tf_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0b05f2c7da381a1f66321098de1d6e4ee15813 --- /dev/null +++ b/test_modeling_tf_auto.py @@ -0,0 +1,226 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import tempfile +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_tf, slow + + +if is_tf_available(): + from transformers import ( + AutoConfig, + BertConfig, + GPT2Config, + T5Config, + TFAutoModel, + TFAutoModelForCausalLM, + TFAutoModelForMaskedLM, + TFAutoModelForPreTraining, + TFAutoModelForQuestionAnswering, + TFAutoModelForSeq2SeqLM, + TFAutoModelForSequenceClassification, + TFAutoModelWithLMHead, + TFBertForMaskedLM, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertModel, + TFFunnelBaseModel, + TFFunnelModel, + TFGPT2LMHeadModel, + TFRobertaForMaskedLM, + TFT5ForConditionalGeneration, + ) + from transformers.models.auto.modeling_tf_auto import ( + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + ) + from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.t5.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST + + +@require_tf +class TFAutoModelTest(unittest.TestCase): + @slow + def test_model_from_pretrained(self): + import h5py + + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) + + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = 
TFAutoModel.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertModel) + + @slow + def test_model_for_pretraining_from_pretrained(self): + import h5py + + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) + + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForPreTraining.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForPreTraining) + + @slow + def test_model_for_causal_lm(self): + for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, GPT2Config) + + model = TFAutoModelForCausalLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFGPT2LMHeadModel) + + @slow + def test_lmhead_model_from_pretrained(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelWithLMHead.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + @slow + def test_model_for_masked_lm(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForMaskedLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + @slow + def test_model_for_encoder_decoder_lm(self): + for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, T5Config) + + model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFT5ForConditionalGeneration) + + @slow + def test_sequence_classification_model_from_pretrained(self): + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForSequenceClassification.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForSequenceClassification) + + @slow + def test_question_answering_model_from_pretrained(self): + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForQuestionAnswering.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForQuestionAnswering) + + def test_from_pretrained_identifier(self): + model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) + 
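# SMALL_MODEL_IDENTIFIER points to a tiny BERT checkpoint, so the LM-head auto class should resolve to TFBertForMaskedLM +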
self.assertIsInstance(model, TFBertForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) + + def test_from_identifier_from_model_type(self): + model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER) + self.assertIsInstance(model, TFRobertaForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) + + def test_from_pretrained_with_tuple_values(self): + # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel + model = TFAutoModel.from_pretrained("sgugger/funnel-random-tiny") + self.assertIsInstance(model, TFFunnelModel) + + config = copy.deepcopy(model.config) + config.architectures = ["FunnelBaseModel"] + model = TFAutoModel.from_config(config) + self.assertIsInstance(model, TFFunnelBaseModel) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = TFAutoModel.from_pretrained(tmp_dir) + self.assertIsInstance(model, TFFunnelBaseModel) + + def test_parents_and_children_in_mappings(self): + # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered + # by the parents and will return the wrong configuration type when using auto models + mappings = ( + TF_MODEL_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + ) + + for mapping in mappings: + mapping = tuple(mapping.items()) + for index, (child_config, child_model) in enumerate(mapping[1:]): + for parent_config, parent_model in mapping[: index + 1]: + with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"): + self.assertFalse(issubclass(child_config, parent_config)) + + # Tuplify child_model and parent_model since some of them could be tuples. + if not isinstance(child_model, (list, tuple)): + child_model = (child_model,) + if not isinstance(parent_model, (list, tuple)): + parent_model = (parent_model,) + + for child, parent in [(a, b) for a in child_model for b in parent_model]: + assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}" diff --git a/test_modeling_tf_bart.py b/test_modeling_tf_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..e88659b9887d4dc34e2441b4753285ff6c3af5a7 --- /dev/null +++ b/test_modeling_tf_bart.py @@ -0,0 +1,477 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers import BartConfig, BartTokenizer, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFBartForConditionalGeneration, TFBartModel + + +@require_tf +class TFBartModelTester: + config_cls = BartConfig + config_updates = {} + hidden_act = "gelu"
+ + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id
+ + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + **self.config_updates, + ) + inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict
+ + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFBartModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + head_mask = inputs_dict["head_mask"] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append to next input_ids and attention_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
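+ # running the full sequence in one pass and running incrementally with the cache must produce the same outputs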
+ next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_bart_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + if cross_attn_head_mask is None: + cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_tf +class TFBartModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFBartForConditionalGeneration, TFBartModel) if is_tf_available() else () + all_generative_model_classes = (TFBartForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_onnx = True + onnx_min_opset = 10 + + def setUp(self): + self.model_tester = TFBartModelTester(self) + self.config_tester = ConfigTester(self, config_class=BartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if 
hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Build the word embedding weights if they don't exist yet, + # then retry fetching the attribute once they are built. + model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None
+ + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size. + assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal)
+ + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal)
+ + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes the CI fail + pass + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors are not close, or a and b aren't both tensors, raise a nice assertion error.""" + if a is None and b is None: + return True + try: + # tf.debugging.assert_near raises an InvalidArgumentError in eager mode if the tensors differ + tf.debugging.assert_near(a, b, atol=atol) + return True + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}")
+ + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +@require_tf +class TFBartHeadTests(unittest.TestCase): + vocab_size = 99 + + def _get_config_and_data(self): + eos_column_vector = tf.ones((4, 1), dtype=tf.int32) * 2 + input_ids = tf.concat([ids_tensor((4, 6), self.vocab_size - 3) + 3, eos_column_vector], axis=1) + batch_size = input_ids.shape[0] + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, +
pad_token_id=1, + bos_token_id=0, + decoder_start_token_id=2, + ) + return config, input_ids, batch_size + + def test_lm_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size) + lm_model = TFBartForConditionalGeneration(config) + outputs = lm_model(input_ids=input_ids, labels=decoder_lm_labels, decoder_input_ids=input_ids, use_cache=False) + expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) + self.assertEqual(outputs.logits.shape, expected_shape) + + def test_lm_uneven_forward(self): + config = BartConfig( + vocab_size=10, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + ) + lm_model = TFBartForConditionalGeneration(config) + context = tf.fill((7, 2), 4) + summary = tf.fill((7, 7), 6) + outputs = lm_model(input_ids=context, decoder_input_ids=summary, use_cache=False) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(outputs.logits.shape, expected_shape) + + +@slow +@require_tf +class TFBartModelIntegrationTest(unittest.TestCase): + def test_inference_no_head(self): + model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large").model + + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = tf.cast(tf.math.not_equal(input_ids, model.config.pad_token_id), tf.int8) + output = model(input_ids=input_ids, attention_mask=attention_mask)[0] + expected_shape = (1, 11, 1024) + self.assertEqual(output.shape, expected_shape) + expected_slice = tf.convert_to_tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3) + + def test_cnn_summarization_same_as_fairseq_hard(self): + hf = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + tok = self.tok + + FRANCE_ARTICLE = ' Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. 
Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. 
A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noqa + EXPECTED_SUMMARY_FRANCE = 'French prosecutor says he\'s not aware of any video footage from on board the plane. German daily Bild and French Paris Match claim to have found a cell phone video of the crash. A French Gendarmerie spokesman calls the reports "completely wrong" and "unwarranted" German airline Lufthansa confirms co-pilot Andreas Lubitz had battled depression.' + + SHORTER_ARTICLE = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. 
But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + EXPECTED_SUMMARY_SHORTER = "The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a move toward greater justice." + + # The below article tests that we don't add any hypotheses outside of the top n_beams + IRAN_ARTICLE = " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. 
Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. 
Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + EXPECTED_SUMMARY_IRAN = "The U.S. and its negotiating partners reached a very strong framework agreement with Iran. Peter Bergen: The debate that has already begun will likely result in more heat than light. He says the agreement limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Bergen says the most important aim of a nuclear deal is preventing a nuclear Iran." + + ARTICLE_SUBWAY = ' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. 
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + EXPECTED_SUMMARY_SUBWAY = "Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx. She was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the subway." + + dct = tok( + [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], + max_length=1024, + truncation_strategy="only_first", + padding="longest", + truncation=True, + return_tensors="tf", + ) + self.assertEqual(1024, dct["input_ids"].shape[1]) + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + ) + + assert hypotheses_batch[:, 1].numpy().tolist() == [0, 0, 0, 0] # test force_bos_token_to_be_generated + decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False) + expected_batch = [ + EXPECTED_SUMMARY_FRANCE, + EXPECTED_SUMMARY_SHORTER, + EXPECTED_SUMMARY_IRAN, + EXPECTED_SUMMARY_SUBWAY, + ] + assert decoded == expected_batch + + @cached_property + def tok(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + +@slow +@require_tf +class FasterTFBartModelIntegrationTests(unittest.TestCase): + """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer.""" + + @cached_property + def tok(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + @cached_property + def xsum_1_1_model(self): + return TFBartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1") + + def test_xsum_1_1_generation(self): + model = self.xsum_1_1_model + assert model.model.decoder.embed_tokens._layer == model.model.shared + ARTICLE = 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. 
"As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' + EXPECTED = " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + dct = self.tok(ARTICLE, return_tensors="tf") + generated_ids = model.generate(**dct, num_beams=4) + result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0] + assert result == EXPECTED + + def test_xsum_1_1_batch_generation(self): + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." 
Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." 
Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. 
Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. 
CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="tf", + padding="longest", + truncation=True, + ) + generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4) + result = self.tok.batch_decode(generated_ids, skip_special_tokens=True) + assert ( + result[0] + == " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + ) + assert ( + result[1] + == " An investigation into the crash that killed at least 10 people in the French capital has been released by the French police investigating the crash." + ) + + def test_encoder_equiv(self): + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. 
"We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? 
German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. 
But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="tf", + padding="longest", + truncation=True, + ) + features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state + + expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]]) + assert np.allclose(features[0, :3, :3].numpy(), expected, atol=1e-3) diff --git a/test_modeling_tf_bert.py b/test_modeling_tf_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..639ba0be9d73970a545f673b0e09f86fbb76b166 --- /dev/null +++ b/test_modeling_tf_bert.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import BertConfig, is_tf_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TF_MODEL_FOR_PRETRAINING_MAPPING + from transformers.models.bert.modeling_tf_bert import ( + TFBertForMaskedLM, + TFBertForMultipleChoice, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + TFBertLMHeadModel, + TFBertModel, + ) + + +class TFBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFBertModel(config=config) + # the model accepts inputs as a dict, a list or bare input_ids; all three call styles are checked + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = 
[input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_bert_lm_head( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.is_decoder = True + model = TFBertLMHeadModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + prediction_scores = model(inputs)["logits"] + self.parent.assertListEqual( + list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + + def create_and_check_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFBertForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFBertForNextSentencePrediction(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFBertForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFBertForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, 
sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFBertForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFBertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFBertModel, + TFBertForMaskedLM, + TFBertLMHeadModel, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + TFBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = True + onnx_min_opset = 10 + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + + return inputs_dict + + def setUp(self): + self.model_tester = TFBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_lm_head(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs) + + def test_model_from_pretrained(self): + model = TFBertModel.from_pretrained("jplu/tiny-tf-bert-random") + self.assertIsNotNone(model) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFBertForMaskedLM, TFBertForPreTraining, TFBertLMHeadModel] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_custom_load_tf_weights(self): + model, output_loading_info = TFBertForTokenClassification.from_pretrained( + "jplu/tiny-tf-bert-random", output_loading_info=True + ) + self.assertEqual(sorted(output_loading_info["unexpected_keys"]), []) + for layer in output_loading_info["missing_keys"]: + self.assertTrue(layer.split("_")[0] in ["dropout", "classifier"]) + + +@require_tf +class TFBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFBertForPreTraining.from_pretrained("lysandre/tiny-bert-random") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 32000] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [-0.05243197, -0.04498899, 0.05512108], + [-0.07444685, -0.01064632, 0.04352357], + [-0.05020351, 0.05530146, 0.00700043], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_blenderbot.py b/test_modeling_tf_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..3870f1dff7e6705d2131f90e4f09b4118a01a623 --- /dev/null +++ b/test_modeling_tf_blenderbot.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
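+ +# Context for `check_decoder_model_past_large_inputs` below (a minimal sketch; the tensor names in this comment are hypothetical): the test verifies that decoding a few new tokens with `past_key_values` matches a full forward pass over the concatenated sequence, i.e. roughly: +# +# full = decoder(tf.concat([ids, next_ids], axis=-1), attention_mask=mask)[0] +# cached = decoder(next_ids, attention_mask=mask, past_key_values=past)[0] +# tf.debugging.assert_near(full[:, -next_ids.shape[1] :, idx], cached[:, :, idx], rtol=1e-3)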
+ + +import unittest + +from transformers import BlenderbotConfig, BlenderbotTokenizer, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFAutoModelForSeq2SeqLM, TFBlenderbotForConditionalGeneration, TFBlenderbotModel + + +@require_tf +class TFBlenderbotModelTester: + config_cls = BlenderbotConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + **self.config_updates, + ) + inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFBlenderbotModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + head_mask = inputs_dict["head_mask"] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + # create hypothetical next tokens to extend next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append the new tokens and attention mask to the existing ones
+ next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that the outputs are equal for the slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_blenderbot_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + if cross_attn_head_mask is None: + cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_tf +class TFBlenderbotModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFBlenderbotForConditionalGeneration, TFBlenderbotModel) if is_tf_available() else () + all_generative_model_classes = (TFBlenderbotForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_onnx = False + + def setUp(self): + self.model_tester = TFBlenderbotModelTester(self) + self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30 sec) and makes the CI fail + 
pass + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Here we build the word embeddings weights if not exists. + # And then we retry to get the attribute once built. + model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size. + assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if tf.debugging.assert_near(a, b, atol=atol): + return True + raise + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +@require_tokenizers +@require_tf +class TFBlenderbot400MIntegrationTests(unittest.TestCase): + src_text = ["My friends are cool but they eat too many carbs."] + model_name = "facebook/blenderbot-400M-distill" + + @cached_property + def tokenizer(self): + return BlenderbotTokenizer.from_pretrained(self.model_name) + + @cached_property + def model(self): + model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name) + return model + + @slow + def 
+    def test_generation_from_long_input(self):
+        model_inputs = self.tokenizer(self.src_text, return_tensors="tf")
+        generated_ids = self.model.generate(
+            model_inputs.input_ids,
+        )
+        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)[0]
+        assert (
+            generated_words
+            == " That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"
+        )
diff --git a/test_modeling_tf_blenderbot_small.py b/test_modeling_tf_blenderbot_small.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d99a76ea2c3ab8b87c6cc872ce4bdf51ad4e3dd
--- /dev/null
+++ b/test_modeling_tf_blenderbot_small.py
@@ -0,0 +1,337 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import BlenderbotSmallConfig, BlenderbotSmallTokenizer, is_tf_available
+from transformers.file_utils import cached_property
+from transformers.testing_utils import require_tf, require_tokenizers, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import TFAutoModelForSeq2SeqLM, TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel
+
+
+@require_tf
+class TFBlenderbotSmallModelTester:
+    config_cls = BlenderbotSmallConfig
+    config_updates = {}
+    hidden_act = "gelu"
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs_for_common(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
+        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
+        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = self.config_cls(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_ids=[2],
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.pad_token_id,
+            **self.config_updates,
+        )
+        inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = TFBlenderbotSmallModel(config=config).get_decoder()
+        input_ids = inputs_dict["input_ids"]
+
+        input_ids = input_ids[:1, :]
+        attention_mask = inputs_dict["attention_mask"][:1, :]
+        head_mask = inputs_dict["head_mask"]
+        self.batch_size = 1
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+        past_key_values = past_key_values[1]
+
+        # create hypothetical next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
+
+        # append to next_input_ids and next_attention_mask
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
+
+        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
+        output_from_past_slice = output_from_past[:, :, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
+
+
+def prepare_blenderbot_small_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = tf.concat(
+            [
+                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
+                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
+            ],
+            axis=-1,
+        )
+    if head_mask is None:
+        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
+    if decoder_head_mask is None:
+        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
+    }
+
+
+@require_tf
+class TFBlenderbotSmallModelTest(TFModelTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel) if is_tf_available() else ()
+    )
+    all_generative_model_classes = (TFBlenderbotSmallForConditionalGeneration,) if is_tf_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFBlenderbotSmallModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_decoder_model_past_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_model_common_attributes(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+
+            if model_class in self.all_generative_model_classes:
+                x = model.get_output_embeddings()
+                assert isinstance(x, tf.keras.layers.Layer)
+                name = model.get_bias()
+                assert isinstance(name, dict)
+                for k, v in name.items():
+                    assert isinstance(v, tf.Variable)
+            else:
+                x = model.get_output_embeddings()
+                assert x is None
+                name = model.get_bias()
+                assert name is None
+
+    def test_resize_token_embeddings(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def _get_word_embedding_weight(model, embedding_layer):
+            if hasattr(embedding_layer, "weight"):
+                return embedding_layer.weight
+            else:
+                # Build the word embedding weights if they don't exist yet,
+                # then retry fetching the attribute once the model is built.
+                model(model.dummy_inputs)
+                if hasattr(embedding_layer, "weight"):
+                    return embedding_layer.weight
+                else:
+                    return None
+
+        for model_class in self.all_model_classes:
+            for size in [config.vocab_size - 10, config.vocab_size + 10, None]:
+                # build the embeddings
+                model = model_class(config=config)
+                old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+                old_final_logits_bias = model.get_bias()
+
+                # reshape the embeddings
+                model.resize_token_embeddings(size)
+                new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings())
+                new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings())
+                new_final_logits_bias = model.get_bias()
+
+                # check that the resized embeddings size matches the desired size.
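+                # (note: passing size=None to resize_token_embeddings is expected to be a
+                # no-op, which is why assert_size falls back to config.vocab_size below)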
+                assert_size = size if size is not None else config.vocab_size
+
+                self.assertEqual(new_input_embeddings.shape[0], assert_size)
+
+                # check that weights remain the same after resizing
+                models_equal = True
+                for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()):
+                    if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                        models_equal = False
+                self.assertTrue(models_equal)
+
+                if old_output_embeddings is not None and new_output_embeddings is not None:
+                    self.assertEqual(new_output_embeddings.shape[0], assert_size)
+
+                    models_equal = True
+                    for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()):
+                        if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                            models_equal = False
+                    self.assertTrue(models_equal)
+
+                if old_final_logits_bias is not None and new_final_logits_bias is not None:
+                    old_final_logits_bias = old_final_logits_bias["final_logits_bias"]
+                    new_final_logits_bias = new_final_logits_bias["final_logits_bias"]
+                    self.assertEqual(new_final_logits_bias.shape[0], 1)
+                    self.assertEqual(new_final_logits_bias.shape[1], assert_size)
+
+                    models_equal = True
+                    for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()):
+                        for p1, p2 in zip(old, new):
+                            if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0:
+                                models_equal = False
+                    self.assertTrue(models_equal)
+
+    def test_saved_model_creation(self):
+        # This test is too long (> 30 sec) and makes the CI fail.
+        pass
+
+
+def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
+    """If tensors are not close, or a and b are not both tensors, raise a nice assertion error."""
+    if a is None and b is None:
+        return True
+    try:
+        # assert_near raises on failure, so reaching the return means the check passed
+        tf.debugging.assert_near(a, b, atol=atol)
+        return True
+    except Exception:
+        if len(prefix) > 0:
+            prefix = f"{prefix}: "
+        raise AssertionError(f"{prefix}{a} != {b}")
+
+
+def _long_tensor(tok_lst):
+    return tf.constant(tok_lst, dtype=tf.int32)
+
+
+@require_tokenizers
+@require_tf
+class TFBlenderbot90MIntegrationTests(unittest.TestCase):
+    src_text = [
+        "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?"
+    ]
+    model_name = "facebook/blenderbot_small-90M"
+
+    @cached_property
+    def tokenizer(self):
+        # use the "old" tokenizer here because of a bug when downloading the new one
+        return BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
+
+    @cached_property
+    def model(self):
+        model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+        return model
+
+    @slow
+    def test_90_generation_from_long_input(self):
+        model_inputs = self.tokenizer(self.src_text, return_tensors="tf")
+        generated_ids = self.model.generate(
+            model_inputs.input_ids,
+            attention_mask=model_inputs.attention_mask,
+            num_beams=2,
+            use_cache=True,
+        )
+        generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True)[0]
+        assert generated_words in (
+            "i don't know. i just feel like i'm going to throw up. it's not fun.",
+            "i'm not sure. i just feel like i've been feeling like i have to be in a certain place",
+            "i'm not sure. i just feel like i've been in a bad situation.",
+        )
diff --git a/test_modeling_tf_bort.py b/test_modeling_tf_bort.py
new file mode 100644
index 0000000000000000000000000000000000000000..8053afbd30cfc00f018f7d40d5ccbbafa1537f09
--- /dev/null
+++ b/test_modeling_tf_bort.py
@@ -0,0 +1,51 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFAutoModel + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFBortIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFAutoModel.from_pretrained("amazon/bort") + + input_ids = tf.convert_to_tensor( + [[0, 18077, 4082, 7804, 8606, 6195, 2457, 3321, 11, 10489, 16, 269, 2579, 328, 2]], + dtype=tf.int32, + ) # Schloß Nymphenburg in Munich is really nice! + + output = model(input_ids)["last_hidden_state"] + expected_shape = tf.TensorShape((1, 15, 1024)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = tf.convert_to_tensor( + [[[-0.0349, 0.0436, -1.8654], [-0.6964, 0.0835, -1.7393], [-0.9819, 0.2956, -0.2868]]], + dtype=tf.float32, + ) + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/test_modeling_tf_camembert.py b/test_modeling_tf_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..dc542526852de7f248ec53416980735d8851e0ec --- /dev/null +++ b/test_modeling_tf_camembert.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFCamembertModel + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFCamembertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base") + + input_ids = tf.convert_to_tensor( + [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], + dtype=tf.int32, + ) # J'aime le camembert !" + + output = model(input_ids)["last_hidden_state"] + expected_shape = tf.TensorShape((1, 10, 768)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. 
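+        # (the reference slice below is assumed to have been recorded from the original
+        # fairseq camembert checkpoint, as the commented-out snippet nearby suggests;
+        # a mismatch beyond atol=1e-4 would point to a conversion or numerical regression)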
+ expected_slice = tf.convert_to_tensor( + [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]], + dtype=tf.float32, + ) + # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0') + # camembert.eval() + # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/test_modeling_tf_common.py b/test_modeling_tf_common.py new file mode 100644 index 0000000000000000000000000000000000000000..330d5c9124581ae2f2d05abd12a770b2e1f6272e --- /dev/null +++ b/test_modeling_tf_common.py @@ -0,0 +1,1444 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import inspect +import json +import os +import random +import tempfile +import unittest +from importlib import import_module +from typing import List, Tuple + +from huggingface_hub import HfApi +from requests.exceptions import HTTPError +from transformers import is_tf_available +from transformers.models.auto import get_values +from transformers.testing_utils import ( + ENDPOINT_STAGING, + PASS, + USER, + _tf_gpu_memory_limit, + is_pt_tf_cross_test, + is_staging_test, + require_onnx, + require_tf, + slow, + tooslow, +) + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import ( + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + BertConfig, + TFBertModel, + TFSharedEmbeddings, + tf_top_k_top_p_filtering, + ) + + if _tf_gpu_memory_limit is not None: + gpus = tf.config.list_physical_devices("GPU") + for gpu in gpus: + # Restrict TensorFlow to only allocate x GB of memory on the GPUs + try: + tf.config.set_logical_device_configuration( + gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=_tf_gpu_memory_limit)] + ) + logical_gpus = tf.config.list_logical_devices("GPU") + print("Logical GPUs", logical_gpus) + except RuntimeError as e: + # Virtual devices must be set before GPUs have been initialized + print(e) + + +def _config_zero_init(config): + configs_no_init = copy.deepcopy(config) + for key in configs_no_init.__dict__.keys(): + if "_range" in key or "_std" in key: + setattr(configs_no_init, key, 0.0) + return configs_no_init + + +@require_tf +class TFModelTesterMixin: + + model_tester = None + all_model_classes = () + all_generative_model_classes = () + test_resize_embeddings = True + test_head_masking = True + is_encoder_decoder = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: 
tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                if isinstance(v, tf.Tensor) and v.ndim > 0
+                else v
+                for k, v in inputs_dict.items()
+            }
+
+        if return_labels:
+            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
+                inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
+                inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+                inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+                inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
+                inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+            elif model_class in [
+                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
+                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
+            ]:
+                inputs_dict["labels"] = tf.zeros(
+                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
+                )
+        return inputs_dict
+
+    def test_initialization(self):
+        pass
+
+    def test_save_load(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=False)
+                model = model_class.from_pretrained(tmpdirname)
+                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+                self.assert_outputs_same(after_outputs, outputs)
+
+    @tooslow
+    def test_graph_mode(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            model = model_class(config)
+
+            @tf.function
+            def run_in_graph_mode():
+                return model(inputs)
+
+            outputs = run_in_graph_mode()
+            self.assertIsNotNone(outputs)
+
+    @tooslow
+    def test_xla_mode(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            model = model_class(config)
+
+            @tf.function(experimental_compile=True)
+            def run_in_graph_mode():
+                return model(inputs)
+
+            outputs = run_in_graph_mode()
+            self.assertIsNotNone(outputs)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.call)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            if model.config.is_encoder_decoder:
+                expected_arg_names = [
+                    "input_ids",
+                    "attention_mask",
+                    "decoder_input_ids",
+                    "decoder_attention_mask",
+                ]
+                expected_arg_names.extend(
+                    ["head_mask", "decoder_head_mask"]
+                    if "head_mask" in arg_names and "decoder_head_mask" in arg_names
+                    else []
+                )
+                # Necessary to handle BART with newly added cross_attn_head_mask
+                expected_arg_names.extend(
+                    ["cross_attn_head_mask", "encoder_outputs"]
+                    if
"cross_attn_head_mask" in arg_names + else ["encoder_outputs"] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + else: + expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @tooslow + def test_saved_model_creation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = False + config.output_attentions = False + + if hasattr(config, "use_cache"): + config.use_cache = False + + model_class = self.all_model_classes[0] + + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + model(class_inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + self.assertTrue(os.path.exists(saved_model_dir)) + + @tooslow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = tf.keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + + if self.is_encoder_decoder: + output_hidden_states = outputs["encoder_hidden_states"] + output_attentions = outputs["encoder_attentions"] + else: + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + self.assertEqual(len(outputs), num_out) + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_onnx_compliancy(self): + if not self.test_onnx: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + INTERNAL_OPS = [ + "Assert", + "AssignVariableOp", + "EmptyTensorList", + "ReadVariableOp", + "ResourceGather", + "TruncatedNormal", + "VarHandleOp", + "VarIsInitializedOp", + ] + onnx_ops = [] + + with open(os.path.join(".", "utils", "tf_ops", "onnx.json")) as f: + onnx_opsets = json.load(f)["opsets"] + + for i in range(1, self.onnx_min_opset + 1): + onnx_ops.extend(onnx_opsets[str(i)]) + + for model_class in self.all_model_classes: + model_op_names = set() + + with tf.Graph().as_default() as g: + model = model_class(config) + model(model.dummy_inputs) + + for op in g.get_operations(): + model_op_names.add(op.node_def.op) + + model_op_names = sorted(model_op_names) + incompatible_ops = [] 
+ + for op in model_op_names: + if op not in onnx_ops and op not in INTERNAL_OPS: + incompatible_ops.append(op) + + self.assertEqual(len(incompatible_ops), 0, incompatible_ops) + + @require_onnx + @slow + def test_onnx_runtime_optimize(self): + if not self.test_onnx: + return + + import keras2onnx + import onnxruntime + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model(model.dummy_inputs) + + onnx_model = keras2onnx.convert_keras(model, model.name, target_opset=self.onnx_min_opset) + + onnxruntime.InferenceSession(onnx_model.SerializeToString()) + + @tooslow + def test_mixed_precision(self): + tf.keras.mixed_precision.experimental.set_policy("mixed_float16") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + outputs = model(class_inputs_dict) + + self.assertIsNotNone(outputs) + + tf.keras.mixed_precision.experimental.set_policy("float32") + + def test_keras_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + tf_main_layer_classes = set( + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and tf.keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + ) + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + + model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) + assert isinstance(model, tf.keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + def assert_outputs_same(self, after_outputs, outputs): + # Make sure we don't have nans + if isinstance(after_outputs, tf.Tensor): + out_1 = after_outputs.numpy() + elif isinstance(after_outputs, dict): + out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() + else: + out_1 = after_outputs[0].numpy() + out_2 = outputs[0].numpy() + self.assertEqual(out_1.shape, out_2.shape) + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + 
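+        # (NaN entries were masked out above, so max_diff only measures finite values;
+        # 1e-5 is the tolerance used throughout these save/load round-trip checks)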
        self.assertLessEqual(max_diff, 1e-5)
+
+    @is_pt_tf_cross_test
+    def test_pt_tf_model_equivalence(self):
+
+        import torch
+
+        import transformers
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+            pt_model_class = getattr(transformers, pt_model_class_name)
+
+            config.output_hidden_states = True
+
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+
+            tf_model = transformers.load_pytorch_model_in_tf2_model(
+                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+            )
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    pt_inputs_dict[name] = key
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
+
+            tf_nans = np.copy(np.isnan(tf_hidden_states))
+            pt_nans = np.copy(np.isnan(pt_hidden_states))
+
+            pt_hidden_states[tf_nans] = 0
+            tf_hidden_states[tf_nans] = 0
+            pt_hidden_states[pt_nans] = 0
+            tf_hidden_states[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            self.assertLessEqual(max_diff, 4e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = {}
+            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
+                if type(key) == bool:
+                    key = np.array(key, dtype=bool)
+                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
+                elif name == "input_values":
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
+                else:
+                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
+            # need to rename encoder-decoder "inputs" for PyTorch
+            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tf_nans = np.copy(np.isnan(tfo))
+            pt_nans =
np.copy(np.isnan(pto)) + + pto[tf_nans] = 0 + tfo[tf_nans] = 0 + pto[pt_nans] = 0 + tfo[pt_nans] = 0 + + max_diff = np.amax(np.abs(tfo - pto)) + self.assertLessEqual(max_diff, 4e-2) + + @tooslow + def test_train_pipeline_custom_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # head_mask and decoder_head_mask has different shapes than other input args + if "head_mask" in inputs_dict: + del inputs_dict["head_mask"] + if "decoder_head_mask" in inputs_dict: + del inputs_dict["decoder_head_mask"] + if "cross_attn_head_mask" in inputs_dict: + del inputs_dict["cross_attn_head_mask"] + tf_main_layer_classes = set( + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and tf.keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + ) + + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(self.model_tester.vocab_size, self.model_tester.hidden_size, name="shared") + config.use_cache = False + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + + if hasattr(self.model_tester, "num_labels"): + num_labels = self.model_tester.num_labels + else: + num_labels = 2 + + X = tf.data.Dataset.from_tensor_slices( + (inputs_dict, np.ones((self.model_tester.batch_size, self.model_tester.seq_length, num_labels, 1))) + ).batch(1) + + hidden_states = main_layer(symbolic_inputs)[0] + outputs = tf.keras.layers.Dense(num_labels, activation="softmax", name="outputs")(hidden_states) + model = tf.keras.models.Model(inputs=symbolic_inputs, outputs=[outputs]) + + model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"]) + model.fit(X, epochs=1) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = tf.keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = tf.keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) + assert isinstance(model, tf.keras.Model) + model(inputs_dict) + + def test_compile_tf_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + max_input = getattr(self.model_tester, "max_position_embeddings", 512) + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + + for model_class in self.all_model_classes: + if self.is_encoder_decoder: + input_ids = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, max_input), + name="decoder_input_ids", + dtype="int32", + ), + "input_ids": tf.keras.Input(batch_shape=(2, max_input), 
name="input_ids", dtype="int32"), + } + elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + input_ids = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32") + else: + input_ids = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32") + + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. + # Let's load it from the disk to be sure we can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=False) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(input_ids) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + input_ids = inputs_keywords.pop("input_ids", None) + outputs_keywords = model(input_ids, **inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(min(out_len % 2, out_len % 5), 0) # differentiation due to newly added cross_attentions + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = 
model_class(config)
+                outputs = model(self._prepare_for_class(inputs_dict, model_class))
+                self.assertEqual(config.output_hidden_states, False)
+                check_decoder_attentions_output(outputs)
+
+            # Check that output attentions can also be changed via the config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            self.assertEqual(config.output_hidden_states, False)
+            check_encoder_attentions_output(outputs)
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            config.output_hidden_states = True
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
+            self.assertEqual(model.config.output_hidden_states, True)
+            check_encoder_attentions_output(outputs)
+
+    def test_headmasking(self):
+        if not self.test_head_masking:
+            return
+
+        random.Random().seed(42)
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        random.Random().seed()
+
+        inputs_dict["output_attentions"] = True
+        config.output_hidden_states = True
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+
+            # Prepare head_mask
+            def prepare_layer_head_mask(i, attention_heads, num_hidden_layers):
+                if i == 0:
+                    return tf.concat(
+                        (tf.zeros(1, dtype=tf.float32), tf.ones(attention_heads - 1, dtype=tf.float32)), 0
+                    )
+                elif i == num_hidden_layers - 1:
+                    return tf.concat(
+                        (tf.zeros(attention_heads - 1, dtype=tf.float32), tf.ones(1, dtype=tf.float32)), 0
+                    )
+                else:
+                    return tf.ones(attention_heads, dtype=tf.float32)
+
+            head_mask = tf.stack(
+                [
+                    prepare_layer_head_mask(i, config.num_attention_heads, config.num_hidden_layers)
+                    for i in range(config.num_hidden_layers)
+                ],
+                0,
+            )
+
+            inputs = self._prepare_for_class(inputs_dict, model_class).copy()
+            inputs["head_mask"] = head_mask
+            if model.config.is_encoder_decoder:
+                signature = inspect.signature(model.call)
+                arg_names = [*signature.parameters.keys()]
+                if "decoder_head_mask" in arg_names:  # necessary differentiation because of T5 model
+                    inputs["decoder_head_mask"] = head_mask
+                if "cross_attn_head_mask" in arg_names:
+                    inputs["cross_attn_head_mask"] = head_mask
+
+            outputs = model(**inputs, return_dict=True)
+
+            def check_attentions_validity(attentions):
+                # Remove Nan
+                for t in attentions:
+                    self.assertLess(
+                        (tf.math.reduce_sum(tf.cast(tf.math.is_nan(t), tf.float32))).numpy(), (tf.size(t) / 4).numpy()
+                    )  # Check we don't have more than 25% nans (arbitrary)
+
+                attentions = [
+                    tf.where(tf.math.is_nan(t), 0.0, t) for t in attentions
+                ]  # remove them (the test is less complete)
+
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[0][..., 0, :, :]).numpy(), 0.0)
+                self.assertNotEqual(tf.math.reduce_sum(attentions[0][..., -1, :, :]).numpy(), 0.0)
+                if len(attentions) > 2:  # encoder-decoder models have only 2 layers in each module
+                    self.assertNotEqual(tf.math.reduce_sum(attentions[1][..., 0, :, :]).numpy(), 0.0)
+                self.assertAlmostEqual(tf.math.reduce_sum(attentions[-1][..., -2, :, :]).numpy(), 0.0)
+                self.assertNotEqual(tf.math.reduce_sum(attentions[-1][..., -1, :, :]).numpy(), 0.0)
+
+            if model.config.is_encoder_decoder:
+                check_attentions_validity(outputs.encoder_attentions)
+                check_attentions_validity(outputs.decoder_attentions)
+                if
"cross_attn_head_mask" in arg_names: + check_attentions_validity(outputs.cross_attentions) + else: + check_attentions_validity(outputs.attentions) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + if model.config.is_encoder_decoder: + encoder_hidden_states = outputs.encoder_hidden_states + decoder_hidden_states = outputs.decoder_hidden_states + + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(encoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(encoder_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + self.assertEqual(len(decoder_hidden_states), expected_num_layers) + self.assertListEqual( + list(decoder_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + else: + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = ( + get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING) + + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING) + + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING) + ) + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + first, second = ( + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + model(self._prepare_for_class(inputs_dict, model_class), training=False)[0], + ) + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_model_outputs_equivalence(self): + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def 
recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}", + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + + inputs = copy.deepcopy(inputs_dict) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) + else: + inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) + inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + + inputs = self._prepare_for_class(inputs, model_class) + + model(inputs) + + @tooslow + def test_graph_mode_with_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + + inputs = copy.deepcopy(inputs_dict) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = 
inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = model.get_input_embeddings()(input_ids) + else: + inputs["inputs_embeds"] = model.get_input_embeddings()(encoder_input_ids) + inputs["decoder_inputs_embeds"] = model.get_input_embeddings()(decoder_input_ids) + + inputs = self._prepare_for_class(inputs, model_class) + + @tf.function + def run_in_graph_mode(): + return model(inputs) + + outputs = run_in_graph_mode() + self.assertIsNotNone(outputs) + + def test_numpy_arrays_inputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def prepare_numpy_arrays(inputs_dict): + inputs_np_dict = {} + for k, v in inputs_dict.items(): + if tf.is_tensor(v): + inputs_np_dict[k] = v.numpy() + else: + inputs_np_dict[k] = np.array(v) + + return inputs_np_dict + + for model_class in self.all_model_classes: + model = model_class(config) + + inputs = self._prepare_for_class(inputs_dict, model_class) + inputs_np = prepare_numpy_arrays(inputs) + + model(inputs_np) + + def test_resize_token_embeddings(self): + if not self.test_resize_embeddings: + return + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + model(model.dummy_inputs) + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_bias = model.get_bias() + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_bias = model.get_bias() + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + + # check that the resized embeddings size matches the desired size. 
+ assert_size = size if size is not None else config.vocab_size + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_bias is not None and new_bias is not None: + for old_weight, new_weight in zip(old_bias.values(), new_bias.values()): + self.assertEqual(new_weight.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_weight.value(), new_weight.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + self.assertEqual(new_output_embeddings.shape[1], old_output_embeddings.shape[1]) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_lm_head_model_random_no_beam_search_generate(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + + # iterate over all generative models + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if the bos token id is not defined, the model needs input_ids + with self.assertRaises(AssertionError): + model.generate(do_sample=True, max_length=5) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5)) + + with self.assertRaises(AssertionError): + # generating multiple sequences with greedy search (no sampling, no beam search) + # is not allowed, as it would always generate the same sequences + model.generate(input_ids, do_sample=False, num_return_sequences=2) + + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) + + # check language generation with bad words tokens + # create one 1-token bad-word sequence and one 2-token bad-word sequence + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = model.generate( + input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def test_lm_head_model_random_beam_search_generate(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict.get("input_ids", None) + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if the bos token id is not defined, the model needs input_ids, num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) + + with self.assertRaises(AssertionError): + # generating more return sequences than there are beams is not possible + 
model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) + + # num_return_sequences > 1, sample + self._check_generated_ids( + model.generate( + input_ids, + do_sample=True, + num_beams=2, + num_return_sequences=2, + ) + ) + # num_return_sequences > 1, greedy + self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) + + # check language generation with bad words tokens + # create one 1-token bad-word sequence and one 2-token bad-word sequence + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = model.generate( + input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def test_loss_computation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + if getattr(model, "compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + added_label = prepared_for_class[ + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + ] + loss_size = tf.size(added_label) + + if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING): + # if the loss is a causal lm loss, the labels are shifted, so one label per batch + # is cut + loss_size = loss_size - self.model_tester.batch_size + + # Test that the model correctly computes the loss with kwargs + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + input_ids = prepared_for_class.pop("input_ids") + + loss = model(input_ids, **prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that the model correctly computes the loss with a dict + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + loss = model(prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that the model correctly computes the loss with a tuple + prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) + + # Get keys that were added with the _prepare_for_class function + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: "input_ids"} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertEqual(loss.shape, [loss_size]) + + def test_generate_with_headmasking(self): + attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + # We want to test only encoder-decoder models + if not config.is_encoder_decoder: + continue + + head_masking = { + "head_mask": tf.zeros((config.encoder_layers, config.encoder_attention_heads)), + "decoder_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + "cross_attn_head_mask": tf.zeros((config.decoder_layers, config.decoder_attention_heads)), + } + + signature = inspect.signature(model.call) + # skip models whose call() does not accept all of the head-mask arguments + if not set(head_masking.keys()) < set([*signature.parameters.keys()]): + continue + + for attn_name, (name, mask) in zip(attention_names, head_masking.items()): + out = model.generate( + inputs_dict["input_ids"], + num_beams=1, + max_length=inputs_dict["input_ids"].shape[-1] + 5, + output_attentions=True, + return_dict_in_generate=True, + **{name: mask}, + ) + # We check the state of decoder_attentions and cross_attentions just from the last step + attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] + self.assertEqual(sum([tf.reduce_sum(w).numpy() for w in attn_weights]), 0.0) + + def _generate_random_bad_tokens(self, num_bad_tokens, model): + # special tokens cannot be bad tokens + special_tokens = [] + if model.config.bos_token_id is not None: + special_tokens.append(model.config.bos_token_id) + if model.config.pad_token_id is not None: + special_tokens.append(model.config.pad_token_id) + if model.config.eos_token_id is not None: + special_tokens.append(model.config.eos_token_id) + + # create random bad tokens that are not special tokens + bad_tokens = [] + while len(bad_tokens) < num_bad_tokens: + token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] + if token not in special_tokens: + bad_tokens.append(token) + return bad_tokens + + def _check_generated_ids(self, output_ids): + for token_id in output_ids[0].numpy().tolist(): + self.assertGreaterEqual(token_id, 0) + self.assertLess(token_id, self.model_tester.vocab_size) + + def _check_match_tokens(self, generated_ids, bad_words_ids): + # for all bad word tokens + for bad_word_ids in bad_words_ids: + # for all slices in batch + for generated_ids_slice in generated_ids: + # for all word idx + for i in range(len(bad_word_ids), len(generated_ids_slice)): + # if tokens match + if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: + return True + return False + + +def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): + """Creates a random int32 tensor of the given shape with values within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) + + return output + + +@require_tf +class UtilsFunctionsTest(unittest.TestCase): + + # tests whether the top_k_top_p_filtering function behaves as expected + def test_top_k_top_p_filtering(self): + logits = tf.convert_to_tensor( + [ + [ + 8.2220991, # 3rd highest value; idx. 0 + -0.5620044, + 5.23229752, + 4.0386393, + -6.8798378, + -0.54785802, + -3.2012153, + 2.92777176, + 1.88171953, + 7.35341276, # 5th highest value; idx. 9 + 8.43207833, # 2nd highest value; idx. 
10 + -9.85711836, + -5.96209236, + -1.13039161, + -7.1115294, + -0.8369633, + -5.3186408, + 7.06427407, + 0.81369344, + -0.82023817, + -5.9179796, + 0.58813443, + -6.99778438, + 4.71551189, + -0.18771637, + 7.44020759, # 4th highest value; idx. 25 + 9.38450987, # 1st highest value; idx. 26 + 2.12662941, + -9.32562038, + 2.35652522, + ], # cumulative prob of the 5 highest values <= 0.6 + [ + 0.58425518, + 4.53139238, + -5.57510464, + -6.28030699, + -7.19529503, + -4.02122551, + 1.39337037, + -6.06707057, + 1.59480517, + -9.643119, + 0.03907799, + 0.67231762, + -8.88206726, + 6.27115922, # 4th highest value; idx. 13 + 2.28520723, + 4.82767506, + 4.30421368, + 8.8275313, # 2nd highest value; idx. 17 + 5.44029958, # 5th highest value; idx. 18 + -4.4735794, + 7.38579536, # 3rd highest value; idx. 20 + -2.91051663, + 2.61946077, + -2.5674762, + -9.48959302, + -4.02922645, + -1.35416918, + 9.67702323, # 1st highest value; idx. 27 + -5.89478553, + 1.85370467, + ], # cumulative prob of the 5 highest values <= 0.6 + ], + dtype=tf.float32, + ) + + non_inf_expected_idx = tf.convert_to_tensor( + [[0, 0], [0, 9], [0, 10], [0, 25], [0, 26], [1, 13], [1, 17], [1, 18], [1, 20], [1, 27]], + dtype=tf.int32, + ) # expected non filtered idx as noted above + + non_inf_expected_output = tf.convert_to_tensor( + [8.222099, 7.3534126, 8.432078, 7.4402075, 9.38451, 6.271159, 8.827531, 5.4402995, 7.3857956, 9.677023], + dtype=tf.float32, + ) # expected non filtered values as noted above + + output = tf_top_k_top_p_filtering(logits, top_k=10, top_p=0.6, min_tokens_to_keep=4) + + non_inf_output = output[output != -float("inf")] + non_inf_idx = tf.cast( + tf.where(tf.not_equal(output, tf.constant(-float("inf"), dtype=tf.float32))), + dtype=tf.int32, + ) + + tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12) + tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx) + + +@require_tf +@is_staging_test +class TFModelPushToHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._api = HfApi(endpoint=ENDPOINT_STAGING) + cls._token = cls._api.login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + cls._api.delete_repo(token=cls._token, name="test-model-tf") + except HTTPError: + pass + + try: + cls._api.delete_repo(token=cls._token, name="test-model-tf-org", organization="valid_org") + except HTTPError: + pass + + def test_push_to_hub(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + model = TFBertModel(config) + # Make sure model is properly initialized + _ = model(model.dummy_inputs) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, push_to_hub=True, repo_name="test-model-tf", use_auth_token=self._token) + + new_model = TFBertModel.from_pretrained(f"{USER}/test-model-tf") + models_equal = True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_push_to_hub_in_organization(self): + config = BertConfig( + vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37 + ) + model = TFBertModel(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained( + tmp_dir, + push_to_hub=True, + repo_name="test-model-tf-org", + use_auth_token=self._token, + organization="valid_org", + ) + + new_model = TFBertModel.from_pretrained("valid_org/test-model-tf-org") + models_equal = 
True + for p1, p2 in zip(model.weights, new_model.weights): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) diff --git a/test_modeling_tf_convbert.py b/test_modeling_tf_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..e882bc64fd6c6b167ca846e3758350e773e94eaa --- /dev/null +++ b/test_modeling_tf_convbert.py @@ -0,0 +1,411 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile +import unittest + +from transformers import ConvBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertModel, + ) + + +class TFConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 384 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.embedding_size = 128 + self.head_ratio = 2 + self.conv_kernel_size = 9 + self.num_groups = 1 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = 
ids_tensor([self.batch_size], self.num_choices) + + config = ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFConvBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
TFConvBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFConvBertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFConvBertModel, + TFConvBertForMaskedLM, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = tf.keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + + if self.is_encoder_decoder: + output_hidden_states = outputs["encoder_hidden_states"] + output_attentions = outputs["encoder_attentions"] + else: + 
output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + self.assertEqual(len(outputs), num_out) + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + @slow + def test_model_from_pretrained(self): + model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, decoder_seq_length, decoder_key_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + 
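The attention assertions above compare against `num_attention_heads / 2` rather than the full head count. A minimal sketch of the arithmetic, assuming ConvBERT's default `head_ratio=2` (the value also set in `TFConvBertModelTester`); this is an illustration, not part of the diff:

# Why the assertions above expect num_attention_heads / 2 heads per layer:
# ConvBERT's mixed-attention block keeps only num_attention_heads // head_ratio
# conventional self-attention heads; the rest are replaced by span-based convolution.
num_attention_heads = 4  # TFConvBertModelTester.num_attention_heads
head_ratio = 2           # ConvBertConfig default, assumed here
seq_length = 7           # TFConvBertModelTester.seq_length

expected_attention_shape = [num_attention_heads // head_ratio, seq_length, seq_length]
print(expected_attention_shape)  # [2, 7, 7] == list(attentions[0].shape[-3:])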
+@require_tf +class TFConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [-0.03475493, -0.4686034, -0.30638832], + [0.22637248, -0.26988646, -0.7423424], + [0.10324868, -0.45013508, -0.58280784], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_ctrl.py b/test_modeling_tf_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..e9531552bd3b15bc23511bf4076d00229d0cc7dd --- /dev/null +++ b/test_modeling_tf_ctrl.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import CTRLConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.ctrl.modeling_tf_ctrl import ( + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, + TFCTRLForSequenceClassification, + TFCTRLLMHeadModel, + TFCTRLModel, + ) + + +class TFCTRLModelTester(object): + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], 
self.num_choices) + + config = CTRLConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFCTRLModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, None, input_mask] # None is the input for 'past' + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFCTRLLMHeadModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_ctrl_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + config.num_labels = self.num_labels + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + inputs = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "labels": sequence_labels, + } + model = TFCTRLForSequenceClassification(config) + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel, TFCTRLForSequenceClassification) if is_tf_available() else () + all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else () + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCTRLModelTester(self) + self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_ctrl_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_model(*config_and_inputs) + + def test_ctrl_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs) + + def 
test_ctrl_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_for_sequence_classification(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFCTRLLMHeadModel] + list_other_models_with_output_ebd = [TFCTRLForSequenceClassification] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + elif model_class in list_other_models_with_output_ebd: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + @slow + def test_model_from_pretrained(self): + for model_name in TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFCTRLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFCTRLModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_ctrl(self): + model = TFCTRLLMHeadModel.from_pretrained("ctrl") + input_ids = tf.convert_to_tensor([[11859, 0, 1611, 8]], dtype=tf.int32) # Legal the president is + expected_output_ids = [ + 11859, + 0, + 1611, + 8, + 5, + 150, + 26449, + 2, + 19, + 348, + 469, + 3, + 2595, + 48, + 20740, + 246533, + 246533, + 19, + 30, + 5, + ] # Legal the president is a good guy and I don't want to lose my job. \n \n I have a + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_tf_distilbert.py b/test_modeling_tf_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..23a8f29d1282390186fece59cd884546fea09714 --- /dev/null +++ b/test_modeling_tf_distilbert.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
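Before the DistilBERT tester below, a note on naming: `DistilBertConfig` does not reuse the BERT-style argument names, so the tester maps its generic attributes onto DistilBERT's own parameters. A short hedged sketch of that mapping (values copied from the tester's attributes):

# Sketch (not part of the diff): DistilBertConfig's own argument names, as used below:
#   hidden_size -> dim, num_hidden_layers -> n_layers,
#   num_attention_heads -> n_heads, intermediate_size -> hidden_dim
from transformers import DistilBertConfig

config = DistilBertConfig(vocab_size=99, dim=32, n_layers=5, n_heads=4, hidden_dim=37)
assert config.dim == 32  # plays the role of hidden_size in the shape assertions below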
+ + +import unittest + +from transformers import DistilBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.distilbert.modeling_tf_distilbert import ( + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFDistilBertForMaskedLM, + TFDistilBertForMultipleChoice, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertModel, + ) + + +class TFDistilBertModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = False + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DistilBertConfig( + vocab_size=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + + result = model(inputs) + + inputs = [input_ids, input_mask] + + result = model(inputs) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + } + result = model(inputs) + 
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDistilBertForSequenceClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_distilbert_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFDistilBertForMultipleChoice(config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDistilBertForTokenClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in list(TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]): + model = TFDistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFDistilBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [0.19261885, -0.13732955, 0.4119799], + [0.22150156, -0.07422661, 0.39037204], + [0.22756018, -0.0896414, 0.3701467], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_dpr.py b/test_modeling_tf_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..39e82fd3ab5bfb256d1fdf8e6f631c0cf13466d2 --- /dev/null +++ b/test_modeling_tf_dpr.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
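The DPR tester below builds its config in two steps: a `BertConfig` for the backbone, then a `DPRConfig` wrapping its fields. A short sketch of the pattern, with values copied from the tester (shown here only for illustration):

# Sketch of the two-step config construction used by TFDPRModelTester below.
from transformers import BertConfig, DPRConfig

bert_config = BertConfig(
    vocab_size=99, hidden_size=32, num_hidden_layers=5,
    num_attention_heads=4, intermediate_size=37, is_decoder=False,
)
dpr_config = DPRConfig(projection_dim=0, **bert_config.to_dict())
# projection_dim == 0 keeps pooler_output at hidden_size, which is why the
# tester asserts shapes against `self.projection_dim or self.hidden_size`.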
+ +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import numpy + import tensorflow as tf + + from transformers import ( + TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, + TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST, + TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST, + BertConfig, + DPRConfig, + TFDPRContextEncoder, + TFDPRQuestionEncoder, + TFDPRReader, + ) + + +class TFDPRModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + projection_dim=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.projection_dim = projection_dim + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2 + ) # follow test_modeling_tf_ctrl.py + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict()) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def 
create_and_check_dpr_context_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRContextEncoder(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_dpr_question_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRQuestionEncoder(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_dpr_reader( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRReader(config=config) + result = model(input_ids, attention_mask=input_mask) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_tf +class TFDPRModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFDPRContextEncoder, + TFDPRQuestionEncoder, + TFDPRReader, + ) + if is_tf_available() + else () + ) + + test_resize_embeddings = False + test_missing_keys = False + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDPRModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_dpr_context_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_context_encoder(*config_and_inputs) + + def test_dpr_question_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_question_encoder(*config_and_inputs) + + def test_dpr_reader_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_reader(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFDPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + for model_name in TF_DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFDPRQuestionEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + for model_name in TF_DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFDPRReader.from_pretrained(model_name) 
+ self.assertIsNotNone(model) + + +@require_tf +class TFDPRModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + + input_ids = tf.constant( + [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]] + ) # [CLS] hello, is my dog cute? [SEP] + output = model(input_ids)[0] # embedding shape = (1, 768) + # compare the actual values for a slice. + expected_slice = tf.constant( + [ + [ + 0.03236253, + 0.12753335, + 0.16818509, + 0.00279786, + 0.3896933, + 0.24264945, + 0.2178971, + -0.02335227, + -0.08481959, + -0.14324117, + ] + ] + ) + self.assertTrue(numpy.allclose(output[:, :10].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/test_modeling_tf_electra.py b/test_modeling_tf_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..0f6272023617776268a9806c6c3622444e961e7b --- /dev/null +++ b/test_modeling_tf_electra.py @@ -0,0 +1,268 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import ElectraConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.electra.modeling_tf_electra import ( + TFElectraForMaskedLM, + TFElectraForMultipleChoice, + TFElectraForPreTraining, + TFElectraForQuestionAnswering, + TFElectraForSequenceClassification, + TFElectraForTokenClassification, + TFElectraModel, + ) + + +class TFElectraModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.embedding_size = 128 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, 
self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_electra_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_electra_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_electra_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFElectraForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_electra_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFElectraForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_electra_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
TFElectraForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFElectraForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFElectraModel, + TFElectraForMaskedLM, + TFElectraForPreTraining, + TFElectraForTokenClassification, + TFElectraForMultipleChoice, + TFElectraForSequenceClassification, + TFElectraForQuestionAnswering, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_electra_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + # for model_name in TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["google/electra-small-discriminator"]: + model = TFElectraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFElectraModelIntegrationTest(unittest.TestCase): + @slow + def 
test_inference_masked_lm(self):
+        model = TFElectraForPreTraining.from_pretrained("lysandre/tiny-electra-random")
+        input_ids = tf.constant([[0, 1, 2, 3, 4, 5]])
+        output = model(input_ids)[0]
+
+        expected_shape = [1, 6]
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = tf.constant([[-0.24651965, 0.8835437, 1.823782]])
+        tf.debugging.assert_near(output[:, :3], expected_slice, atol=1e-4)
diff --git a/test_modeling_tf_flaubert.py b/test_modeling_tf_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd2f053ca745cdba90fc040c6203ad8368f7eb91
--- /dev/null
+++ b/test_modeling_tf_flaubert.py
@@ -0,0 +1,363 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import is_tf_available
+from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import numpy as np
+    import tensorflow as tf
+
+    from transformers import (
+        TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+        FlaubertConfig,
+        TFFlaubertForMultipleChoice,
+        TFFlaubertForQuestionAnsweringSimple,
+        TFFlaubertForSequenceClassification,
+        TFFlaubertForTokenClassification,
+        TFFlaubertModel,
+        TFFlaubertWithLMHeadModel,
+    )
+
+
+class TFFlaubertModelTester:
+    def __init__(
+        self,
+        parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_lengths = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.gelu_activation = True
+        self.sinusoidal_embeddings = False
+        self.causal = False
+        self.asm = False
+        self.n_langs = 2
+        self.vocab_size = 99
+        self.n_special = 0
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.summary_type = "last"
+        self.use_proj = True
+        self.scope = None
+        self.bos_token_id = 0
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
+
+        input_lengths = None
+        if self.use_input_lengths:
+            input_lengths = (
+                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+            )  # small variation of seq_length
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+        sequence_labels = None
+        token_labels = None
+        is_impossible_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels =
ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = FlaubertConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + bos_token_id=self.bos_token_id, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def create_and_check_flaubert_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertModel(config=config) + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_flaubert_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertWithLMHeadModel(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_flaubert_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertForQuestionAnsweringSimple(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_flaubert_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertForSequenceClassification(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_flaubert_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_labels = self.num_labels + model = TFFlaubertForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def 
create_and_check_flaubert_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = TFFlaubertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "langs": token_type_ids, + "lengths": input_lengths, + } + return config, inputs_dict + + +@require_tf +class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFFlaubertModel, + TFFlaubertWithLMHeadModel, + TFFlaubertForSequenceClassification, + TFFlaubertForQuestionAnsweringSimple, + TFFlaubertForTokenClassification, + TFFlaubertForMultipleChoice, + ) + if is_tf_available() + else () + ) + all_generative_model_classes = ( + (TFFlaubertWithLMHeadModel,) if is_tf_available() else () + ) # TODO (PVP): Check other models whether language generation is also applicable + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFFlaubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_flaubert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_model(*config_and_inputs) + + def test_flaubert_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs) + + def test_flaubert_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_qa(*config_and_inputs) + + def test_flaubert_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFFlaubertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFFlaubertModelIntegrationTest(unittest.TestCase): + 
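# regression check: the slices below pin the hidden-state values of this
+    # public checkpoint; atol=1e-4 leaves room for minor numerical drift
+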
@slow
+    def test_output_embeds_base_model(self):
+        model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased")
+
+        input_ids = tf.convert_to_tensor(
+            [[0, 158, 735, 2592, 1424, 6727, 82, 1]],
+            dtype=tf.int32,
+        )  # "J'aime flaubert !"
+
+        output = model(input_ids)[0]
+        expected_shape = tf.TensorShape((1, 8, 512))
+        self.assertEqual(output.shape, expected_shape)
+        # compare the actual values for a slice.
+        expected_slice = tf.convert_to_tensor(
+            [
+                [
+                    [-1.8768773, -1.566555, 0.27072418],
+                    [-1.6920038, -0.5873505, 1.9329599],
+                    [-2.9563985, -1.6993835, 1.7972052],
+                ]
+            ],
+            dtype=tf.float32,
+        )
+
+        self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
diff --git a/test_modeling_tf_funnel.py b/test_modeling_tf_funnel.py
new file mode 100644
index 0000000000000000000000000000000000000000..094f1af0796974ad40ae3b1fa88d00c2eb77e890
--- /dev/null
+++ b/test_modeling_tf_funnel.py
@@ -0,0 +1,409 @@
+# coding=utf-8
+# Copyright 2020 HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import FunnelConfig, is_tf_available
+from transformers.testing_utils import require_tf
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import (
+        TFFunnelBaseModel,
+        TFFunnelForMaskedLM,
+        TFFunnelForMultipleChoice,
+        TFFunnelForPreTraining,
+        TFFunnelForQuestionAnswering,
+        TFFunnelForSequenceClassification,
+        TFFunnelForTokenClassification,
+        TFFunnelModel,
+    )
+
+
+class TFFunnelModelTester:
+    """You can also import this, e.g. via `from .test_modeling_funnel import FunnelModelTester`."""
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=99,
+        block_sizes=[1, 1, 2],
+        num_decoder_layers=1,
+        d_model=32,
+        n_head=4,
+        d_head=8,
+        d_inner=37,
+        hidden_act="gelu_new",
+        hidden_dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        max_position_embeddings=512,
+        type_vocab_size=3,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+        base=False,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.block_sizes = block_sizes
+        self.num_decoder_layers = num_decoder_layers
+        self.d_model = d_model
+        self.n_head = n_head
+        self.d_head = d_head
+        self.d_inner = d_inner
+        self.hidden_act = hidden_act
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = 2
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+
self.scope = scope + + # Used in the tests to check the size of the first attention layer + self.num_attention_heads = n_head + # Used in the tests to check the size of the first hidden state + self.hidden_size = self.d_model + # Used in the tests to check the number of output hidden states/attentions + self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers) + # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with + # the last hidden state of the first block (which is the first hidden state of the decoder). + if not base: + self.expected_num_hidden_layers = self.num_hidden_layers + 2 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = FunnelConfig( + vocab_size=self.vocab_size, + block_sizes=self.block_sizes, + num_decoder_layers=self.num_decoder_layers, + d_model=self.d_model, + n_head=self.n_head, + d_head=self.d_head, + d_inner=self.d_inner, + hidden_act=self.hidden_act, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + activation_dropout=self.activation_dropout, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + ) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + config.truncate_seq = False + model = TFFunnelModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + config.separate_cls = False + model = TFFunnelModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + def create_and_check_base_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelBaseModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + config.truncate_seq = False + model = TFFunnelBaseModel(config=config) + result = model(input_ids) + 
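# with truncate_seq disabled, the base model is expected to return pooled
+        # length 3 rather than 2 for this 7-token input, since the odd-length
+        # sequence is no longer truncated before pooling
+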
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model)) + + config.separate_cls = False + model = TFFunnelBaseModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + def create_and_check_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_labels = self.num_labels + model = TFFunnelForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_choices = self.num_choices + model = TFFunnelForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_labels = self.num_labels + model = TFFunnelForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_tf
+class TFFunnelModelTest(TFModelTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            TFFunnelModel,
+            TFFunnelForMaskedLM,
+            TFFunnelForPreTraining,
+            TFFunnelForQuestionAnswering,
+            TFFunnelForTokenClassification,
+        )
+        if is_tf_available()
+        else ()
+    )
+    test_head_masking = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFFunnelModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=FunnelConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
+
+    def test_saved_model_creation(self):
+        # This test is too long (>30 sec) and makes the CI fail
+        pass
+
+    def test_compile_tf_model(self):
+        # This test fails the CI. TODO Lysandre re-enable it
+        pass
+
+
+@require_tf
+class TFFunnelBaseModelTest(TFModelTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (TFFunnelBaseModel, TFFunnelForMultipleChoice, TFFunnelForSequenceClassification) if is_tf_available() else ()
+    )
+    test_head_masking = False
+    test_onnx = False
+
+    def setUp(self):
+        self.model_tester = TFFunnelModelTester(self, base=True)
+        self.config_tester = ConfigTester(self, config_class=FunnelConfig)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_base_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_base_model(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
+
+    def test_saved_model_creation(self):
+        # This test is too long (>30 sec) and makes the CI fail
+        pass
diff --git a/test_modeling_tf_gpt2.py b/test_modeling_tf_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e13f0fdc1c4eb39994c525c3a7fb2730338afb1
--- /dev/null
+++ b/test_modeling_tf_gpt2.py
@@ -0,0 +1,458 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import GPT2Config, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.gpt2.modeling_tf_gpt2 import ( + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, + TFGPT2DoubleHeadsModel, + TFGPT2ForSequenceClassification, + TFGPT2LMHeadModel, + TFGPT2Model, + shape_list, + ) + + +class TFGPT2ModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.bos_token_id = self.vocab_size - 1 + self.eos_token_id = self.vocab_size - 1 + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + return_dict=True, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_gpt2_model(self, config, 
input_ids, input_mask, head_mask, token_type_ids, *args):
+        model = TFGPT2Model(config=config)
+        inputs = {
+            "input_ids": input_ids,
+            "attention_mask": input_mask,
+            "token_type_ids": token_type_ids,
+        }
+        result = model(inputs)
+
+        inputs = [input_ids, None, input_mask]  # None is the input for 'past'
+        result = model(inputs)
+
+        result = model(input_ids)
+
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+        model = TFGPT2Model(config=config)
+
+        # first forward pass
+        outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
+        outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
+        outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
+
+        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
+        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
+
+        output, past = outputs.to_tuple()
+
+        # create a hypothetical next token and extend next_input_ids with it
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+        next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+        # append to next input_ids and token_type_ids
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
+
+        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
+        output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
+
+    def create_and_check_gpt2_model_attention_mask_past(
+        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+    ):
+        model = TFGPT2Model(config=config)
+
+        # create attention mask
+        half_seq_length = self.seq_length // 2
+        attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
+        attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32)
+        attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1)
+
+        # first forward pass
+        output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
+
+        # create a hypothetical next token and extend next_input_ids with it
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # change a random masked slice from input_ids
+        random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
+        random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size)
+        vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change)
+        condition = tf.transpose(
+            tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size))
+        )
+        input_ids = tf.where(condition, random_other_next_tokens, input_ids)
+
+        # append to next input_ids and attn_mask
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1)
+
+        # get two different outputs
+        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
+        output_from_past_slice = output_from_past[:, 0, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
+
+    def create_and_check_gpt2_model_past_large_inputs(
+        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+    ):
+        model = TFGPT2Model(config=config)
+
+        input_ids = input_ids[:1, :]
+        input_mask = input_mask[:1, :]
+        token_type_ids = token_type_ids[:1, :]
+        self.batch_size = 1
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, use_cache=True)
+
+        output, past = outputs.to_tuple()
+
+        # create a hypothetical next token and extend next_input_ids with it
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+        next_token_types = ids_tensor((self.batch_size, 3), self.type_vocab_size)
+
+        # append to next input_ids and token_type_ids
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
+        next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)
+
+        output_from_no_past = model(
+            next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
+        )["last_hidden_state"]
+        output_from_past = model(
+            next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past=past
+        )["last_hidden_state"]
+        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
+        output_from_past_slice = output_from_past[:, :, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
+
+    def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+        model = TFGPT2LMHeadModel(config=config)
+        inputs = {
+            "input_ids": input_ids,
+            "attention_mask": input_mask,
+            "token_type_ids": token_type_ids,
+        }
+        result = model(inputs)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_gpt2_double_head(
+        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
+    ):
+        model = TFGPT2DoubleHeadsModel(config=config)
+
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "mc_token_ids": mc_token_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        result = model(inputs)
+        self.parent.assertEqual(
+            result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
+        )
+        self.parent.assertEqual(result.mc_logits.shape,
(self.batch_size, self.num_choices)) + + def create_and_check_gpt2_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": sequence_labels, + } + model = TFGPT2ForSequenceClassification(config) + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class TFGPT2ModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel) + if is_tf_available() + else () + ) + all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else () + test_head_masking = False + test_onnx = True + onnx_min_opset = 10 + + def setUp(self): + self.model_tester = TFGPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) + + def test_gpt2_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) + + def test_gpt2_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) + + def test_gpt2_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs) + + def test_gpt2_double_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_gpt2_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in 
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFGPT2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_gpt2(self): + model = TFGPT2LMHeadModel.from_pretrained("gpt2") + input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog + expected_output_ids = [ + 464, + 3290, + 373, + 1043, + 287, + 257, + 2214, + 1474, + 262, + 16246, + 286, + 2688, + 290, + 2688, + 27262, + 13, + 198, + 198, + 464, + 3290, + ] # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) + + @slow + def test_lm_generate_distilgpt2(self): + model = TFGPT2LMHeadModel.from_pretrained("distilgpt2") + input_ids = tf.convert_to_tensor([[464, 1893]], dtype=tf.int32) # The president + expected_output_ids = [ + 464, + 1893, + 286, + 262, + 1578, + 1829, + 11, + 290, + 262, + 1893, + 286, + 262, + 1578, + 7526, + 11, + 423, + 587, + 287, + 262, + 2635, + ] # The president of the United States, and the president of the United Kingdom, have been in the White + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_tf_layoutlm.py b/test_modeling_tf_layoutlm.py new file mode 100644 index 0000000000000000000000000000000000000000..119b6f6f04d558053154a462bff2cbdad1725129 --- /dev/null +++ b/test_modeling_tf_layoutlm.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
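+
+# These tests exercise the TensorFlow LayoutLM variants (base model plus
+# masked-LM, sequence-classification and token-classification heads); the slow
+# integration tests below run the public microsoft/layoutlm-base-uncased
+# checkpoint against pinned output values.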
+ +import unittest + +import numpy as np + +from transformers import LayoutLMConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.layoutlm.modeling_tf_layoutlm import ( + TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLayoutLMForMaskedLM, + TFLayoutLMForSequenceClassification, + TFLayoutLMForTokenClassification, + TFLayoutLMModel, + ) + + +class TFLayoutLMModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + range_bbox=1000, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.range_bbox = range_bbox + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + # convert bbox to numpy since TF does not support item assignment + bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).numpy() + # Ensure that bbox is legal + for i in range(bbox.shape[0]): + for j in range(bbox.shape[1]): + if bbox[i, j, 3] < bbox[i, j, 1]: + t = bbox[i, j, 3] + bbox[i, j, 3] = bbox[i, j, 1] + bbox[i, j, 1] = t + if bbox[i, j, 2] < bbox[i, j, 0]: + t = bbox[i, j, 2] + bbox[i, j, 2] = bbox[i, j, 0] + bbox[i, j, 0] = t + bbox = tf.convert_to_tensor(bbox) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = LayoutLMConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFLayoutLMModel(config=config) + + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, bbox, token_type_ids=token_type_ids) + result = model(input_ids, bbox) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFLayoutLMForMaskedLM(config=config) + + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFLayoutLMForSequenceClassification(config=config) + + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFLayoutLMForTokenClassification(config=config) + + result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "bbox": bbox, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class LayoutLMModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (TFLayoutLMModel, TFLayoutLMForMaskedLM, TFLayoutLMForTokenClassification, TFLayoutLMForSequenceClassification) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = True + onnx_min_opset = 10 + + def setUp(self): + self.model_tester = TFLayoutLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + 
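# rerun the same shape checks under each supported position-embedding scheme
+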
config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFLayoutLMModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +def prepare_layoutlm_batch_inputs(): + # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: + # fmt: off + input_ids = tf.convert_to_tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]]) # noqa: E231 + attention_mask = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],]) # noqa: E231 + bbox = tf.convert_to_tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]]) # noqa: E231 + token_type_ids = tf.convert_to_tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]) # noqa: E231 + # these are sequence labels (i.e. 
at the token level) + labels = tf.convert_to_tensor([[-100,10,10,10,9,1,-100,7,7,-100,7,7,4,2,5,2,8,8,-100,-100,5,0,3,2,-100],[-100,12,12,12,-100,12,10,-100,-100,-100,-100,10,12,9,-100,-100,-100,10,10,10,9,12,-100,10,-100]]) # noqa: E231 + # fmt: on + + return input_ids, attention_mask, bbox, token_type_ids, labels + + +@require_tf +class TFLayoutLMModelIntegrationTest(unittest.TestCase): + @slow + def test_forward_pass_no_head(self): + model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased") + + input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids) + + # test the sequence output on [0, :3, :3] + expected_slice = tf.convert_to_tensor( + [[0.1785, -0.1947, -0.0425], [-0.3254, -0.2807, 0.2553], [-0.5391, -0.3322, 0.3364]], + ) + + self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3)) + + # test the pooled output on [1, :3] + expected_slice = tf.convert_to_tensor([-0.6580, -0.0214, 0.8552]) + + self.assertTrue(np.allclose(outputs.pooler_output[1, :3], expected_slice, atol=1e-3)) + + @slow + def test_forward_pass_sequence_classification(self): + # initialize model with randomly initialized sequence classification head + model = TFLayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=2) + + input_ids, attention_mask, bbox, token_type_ids, _ = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=tf.convert_to_tensor([1, 1]), + ) + + # test whether we get a loss as a scalar + loss = outputs.loss + expected_shape = (2,) + self.assertEqual(loss.shape, expected_shape) + + # test the shape of the logits + logits = outputs.logits + expected_shape = (2, 2) + self.assertEqual(logits.shape, expected_shape) + + @slow + def test_forward_pass_token_classification(self): + # initialize model with randomly initialized token classification head + model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=13) + + input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() + + # forward pass + outputs = model( + input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels + ) + + # test the shape of the logits + logits = outputs.logits + expected_shape = tf.convert_to_tensor((2, 25, 13)) + self.assertEqual(logits.shape, expected_shape) diff --git a/test_modeling_tf_led.py b/test_modeling_tf_led.py new file mode 100644 index 0000000000000000000000000000000000000000..41d132c80b3db6c1ed78b0d59e1cd512614e0be0 --- /dev/null +++ b/test_modeling_tf_led.py @@ -0,0 +1,432 @@ +# coding=utf-8 +# Copyright Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import LEDConfig, is_tf_available
+from transformers.testing_utils import require_tf, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import TFLEDForConditionalGeneration, TFLEDModel
+
+
+@require_tf
+class TFLEDModelTester:
+    config_cls = LEDConfig
+    config_updates = {}
+    hidden_act = "gelu"
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+        attention_window=4,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.attention_window = attention_window
+
+        # `ModelTesterMixin.test_attention_outputs` expects attention tensors of size
+        # [num_attention_heads, encoder_seq_length, encoder_key_length], but TFLongformerSelfAttention
+        # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1]
+        # because its local attention only attends to `self.attention_window` tokens, plus one before and one after
+        self.key_length = self.attention_window + 2
+
+        # because of padding, `encoder_seq_length` is different from `seq_length`.
Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = ( + self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window + ) + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_ids=[2], + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + attention_window=self.attention_window, + **self.config_updates, + ) + inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids) + global_attention_mask = tf.concat( + [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]], + axis=-1, + ) + inputs_dict["global_attention_mask"] = global_attention_mask + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFLEDModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_led_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + 
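+                # keep the first decoder position visible unconditionally: in real use it
+                # holds `decoder_start_token_id`, which may coincide with `pad_token_id`
+                # and would otherwise be masked out by the comparison below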
tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + } + + +@require_tf +class TFLEDModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFLEDForConditionalGeneration, TFLEDModel) if is_tf_available() else () + all_generative_model_classes = (TFLEDForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFLEDModelTester(self) + self.config_tester = ConfigTester(self, config_class=LEDConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Here we build the word embeddings weights if not exists. + # And then we retry to get the attribute once built. + model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size. 
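+                # (`resize_token_embeddings(None)` leaves the vocabulary untouched, so the
+                # expected size falls back to `config.vocab_size` in that case)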
+ assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict["global_attention_mask"] = tf.zeros_like(inputs_dict["attention_mask"]) + num_global_attn_indices = 2 + inputs_dict["global_attention_mask"] = tf.where( + tf.range(self.model_tester.seq_length)[None, :] < num_global_attn_indices, + 1, + inputs_dict["global_attention_mask"], + ) + + config.return_dict = True + seq_length = self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + + def check_decoder_attentions_output(outputs): + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [t.numpy() for t in outputs.encoder_attentions] + global_attentions = [t.numpy() for t in outputs.encoder_global_attentions] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(global_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, seq_length], + ) + self.assertListEqual( + list(global_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, num_global_attn_indices], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del 
inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + def test_xla_mode(self): + # TODO JP: Make LED XLA compliant + pass + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + def test_generate_with_headmasking(self): + # TODO: Head-masking not yet implement + pass + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if tf.debugging.assert_near(a, b, atol=atol): + return True + raise + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +TOLERANCE = 1e-4 + + +@slow +@require_tf +class TFLEDModelIntegrationTest(unittest.TestCase): + def test_inference_no_head(self): + model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").led + + # change to intended input here + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 1024, 768) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.convert_to_tensor( + [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE) + + def test_inference_with_head(self): + model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384") + + # change to intended input here + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 1024, model.config.vocab_size) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.convert_to_tensor( + [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE) diff --git a/test_modeling_tf_longformer.py b/test_modeling_tf_longformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b88437a1373fa5995f70937f83e7c89196c5f9f3 --- /dev/null +++ b/test_modeling_tf_longformer.py @@ -0,0 +1,709 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+from transformers import is_tf_available
+from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import (
+        LongformerConfig,
+        TFLongformerForMaskedLM,
+        TFLongformerForMultipleChoice,
+        TFLongformerForQuestionAnswering,
+        TFLongformerForSequenceClassification,
+        TFLongformerForTokenClassification,
+        TFLongformerModel,
+        TFLongformerSelfAttention,
+    )
+
+    def shape_list(x):
+        """
+        copied from transformers.modeling_tf_utils
+        """
+        static = x.shape.as_list()
+        dynamic = tf.shape(x)
+        return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+class TFLongformerModelTester:
+    def __init__(
+        self,
+        parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_mask = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.vocab_size = 99
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.intermediate_size = 37
+        self.hidden_act = "gelu"
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.scope = None
+        self.attention_window = 4
+
+        # `ModelTesterMixin.test_attention_outputs` expects attention tensors of size
+        # [num_attention_heads, encoder_seq_length, encoder_key_length], but TFLongformerSelfAttention
+        # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1]
+        # because its local attention only attends to `self.attention_window` tokens, plus one before and one after
+        self.key_length = self.attention_window + 2
+
+        # because of padding, `encoder_seq_length` is different from `seq_length`.
Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = ( + self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window + ) + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = LongformerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + attention_window=self.attention_window, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_attention_mask_determinism( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFLongformerModel(config=config) + + attention_mask = tf.ones(input_ids.shape, dtype=tf.dtypes.int32) + output_with_mask = model(input_ids, attention_mask=attention_mask)[0] + output_without_mask = model(input_ids)[0] + tf.debugging.assert_near(output_with_mask[0, 0, :5], output_without_mask[0, 0, :5], rtol=1e-4) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.return_dict = True + model = TFLongformerModel(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertListEqual( + shape_list(result.last_hidden_state), [self.batch_size, self.seq_length, self.hidden_size] + ) + self.parent.assertListEqual(shape_list(result.pooler_output), [self.batch_size, self.hidden_size]) + + def create_and_check_model_with_global_attention_mask( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.return_dict = True + model = TFLongformerModel(config=config) + half_input_mask_length = shape_list(input_mask)[-1] // 2 + global_attention_mask = tf.concat( + [ + tf.zeros_like(input_mask)[:, :half_input_mask_length], + tf.ones_like(input_mask)[:, half_input_mask_length:], + ], + axis=-1, + ) + + result = model( + input_ids, + attention_mask=input_mask, + global_attention_mask=global_attention_mask, + token_type_ids=token_type_ids, + ) + result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask) + result = model(input_ids, global_attention_mask=global_attention_mask) + + self.parent.assertListEqual( + 
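+            # TFLongformer pads its inputs to a multiple of `attention_window` internally
+            # (e.g. seq_length 7 with window 4 becomes 7 + (4 - 7 % 4) % 4 = 8) but crops
+            # the outputs back, so the hidden states keep the original `seq_length`: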
shape_list(result.last_hidden_state), [self.batch_size, self.seq_length, self.hidden_size] + ) + self.parent.assertListEqual(shape_list(result.pooler_output), [self.batch_size, self.hidden_size]) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.return_dict = True + model = TFLongformerForMaskedLM(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertListEqual(shape_list(result.logits), [self.batch_size, self.seq_length, self.vocab_size]) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.return_dict = True + model = TFLongformerForQuestionAnswering(config=config) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + + self.parent.assertListEqual(shape_list(result.start_logits), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(shape_list(result.end_logits), [self.batch_size, self.seq_length]) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFLongformerForSequenceClassification(config=config) + output = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ).logits + self.parent.assertListEqual(shape_list(output), [self.batch_size, self.num_labels]) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFLongformerForTokenClassification(config=config) + output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels).logits + self.parent.assertListEqual(shape_list(output), [self.batch_size, self.seq_length, self.num_labels]) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFLongformerForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + output = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + global_attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ).logits + self.parent.assertListEqual(list(output.shape), [self.batch_size, self.num_choices]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + # global attention mask has to be partly defined + # to trace all weights + global_attention_mask = tf.concat( + [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]], + axis=-1, + ) + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": 
token_type_ids, + "attention_mask": input_mask, + "global_attention_mask": global_attention_mask, + } + return config, inputs_dict + + def prepare_config_and_inputs_for_question_answering(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + # Replace sep_token_id by some random id + input_ids = tf.where(input_ids == config.sep_token_id, 0, input_ids) + # Make sure there are exactly three sep_token_id + input_ids = tf.concat([input_ids[:, :-3], tf.ones_like(input_ids)[:, -3:] * config.sep_token_id], axis=-1) + input_mask = tf.ones_like(input_ids) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + +@require_tf +class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFLongformerModel, + TFLongformerForMaskedLM, + TFLongformerForQuestionAnswering, + TFLongformerForSequenceClassification, + TFLongformerForMultipleChoice, + TFLongformerForTokenClassification, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFLongformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_attention_mask_determinism(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_attention_mask_determinism(*config_and_inputs) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_global_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_global_attention_mask(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_question_answering() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + def test_xla_mode(self): + # TODO JP: Make Longformer XLA compliant + pass + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFLongformerModelIntegrationTest(unittest.TestCase): + def _get_hidden_states(self): + return tf.convert_to_tensor( + [ + [ + [ + 4.98332758e-01, + 2.69175139e00, + -7.08081422e-03, + 1.04915401e00, + -1.83476661e00, + 7.67220476e-01, + 2.98580543e-01, + 2.84803992e-02, + ], + [ + -7.58357372e-01, + 4.20635998e-01, + 
-4.04739919e-02, + 1.59924145e-01, + 2.05135748e00, + -1.15997978e00, + 5.37166397e-01, + 2.62873606e-01, + ], + [ + -1.69438001e00, + 4.17574660e-01, + -1.49196962e00, + -1.76483717e00, + -1.94566312e-01, + -1.71183858e00, + 7.72903565e-01, + -1.11557056e00, + ], + [ + 5.44028163e-01, + 2.05466114e-01, + -3.63045868e-01, + 2.41865062e-01, + 3.20348382e-01, + -9.05611176e-01, + -1.92690727e-01, + -1.19917547e00, + ], + ] + ], + dtype=tf.float32, + ) + + def test_diagonalize(self): + hidden_states = self._get_hidden_states() + hidden_states = tf.reshape(hidden_states, (1, 8, 4)) # set seq length = 8, hidden dim = 4 + chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2) + window_overlap_size = shape_list(chunked_hidden_states)[2] + self.assertTrue(window_overlap_size == 4) + + padded_hidden_states = TFLongformerSelfAttention._pad_and_diagonalize(chunked_hidden_states) + + self.assertTrue( + shape_list(padded_hidden_states)[-1] == shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 + ) + + # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3) + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3) + + # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629] + tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3) + tf.debugging.assert_near( + padded_hidden_states[0, 0, -1, :3], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3 + ) + + def test_pad_and_transpose_last_two_dims(self): + hidden_states = self._get_hidden_states() + self.assertTrue(shape_list(hidden_states), [1, 8, 4]) + + # pad along seq length dim + paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32) + + hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2) + padded_hidden_states = TFLongformerSelfAttention._pad_and_transpose_last_two_dims(hidden_states, paddings) + self.assertTrue(shape_list(padded_hidden_states) == [1, 1, 8, 5]) + + expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32) + tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6) + tf.debugging.assert_near( + hidden_states[0, 0, -1, :], tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6 + ) + + def test_mask_invalid_locations(self): + hidden_states = self._get_hidden_states() + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) + hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2) + + hid_states_1 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states, 1) + hid_states_2 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states, 2) + hid_states_3 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states[:, :, :, :3], 2) + hid_states_4 = TFLongformerSelfAttention._mask_invalid_locations(hidden_states[:, :, 2:, :], 2) + + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)) == 8) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)) == 12) + + def 
test_chunk(self): + hidden_states = self._get_hidden_states() + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) + + chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2) + + # expected slices across chunk and seq length dim + expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32) + expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32) + + self.assertTrue(shape_list(chunked_hidden_states) == [1, 3, 4, 4]) + tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3) + tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3) + + def test_layer_local_attn(self): + model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") + layer = model.longformer.encoder.layer[0].attention.self_attention + hidden_states = self._get_hidden_states() + batch_size, seq_length, hidden_size = hidden_states.shape + + attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32) + is_index_global_attn = tf.math.greater(attention_mask, 1) + is_global_attn = tf.math.reduce_any(is_index_global_attn) + + attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None]) + is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) + + layer_head_mask = None + + output_hidden_states = layer( + [hidden_states, attention_mask, layer_head_mask, is_index_masked, is_index_global_attn, is_global_attn] + )[0] + + expected_slice = tf.convert_to_tensor( + [0.00188, 0.012196, -0.017051, -0.025571, -0.02996, 0.017297, -0.011521, 0.004848], dtype=tf.dtypes.float32 + ) + + self.assertTrue(output_hidden_states.shape, (1, 4, 8)) + tf.debugging.assert_near(output_hidden_states[0, 1], expected_slice, rtol=1e-3) + + def test_layer_global_attn(self): + model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") + layer = model.longformer.encoder.layer[0].attention.self_attention + hidden_states = self._get_hidden_states() + + hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0) + batch_size, seq_length, hidden_size = hidden_states.shape + + # create attn mask + attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) + attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) + + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 1, 10000.0, attention_mask_1) + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1) + attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] > 0, 10000.0, attention_mask_2) + attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0) + + is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) + is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0) + is_global_attn = tf.math.reduce_any(is_index_global_attn) + + layer_head_mask = None + + output_hidden_states = layer( + [ + hidden_states, + -tf.math.abs(attention_mask), + layer_head_mask, + is_index_masked, + is_index_global_attn, + is_global_attn, + ] + )[0] + + self.assertTrue(output_hidden_states.shape, (2, 4, 8)) + expected_slice_0 = tf.convert_to_tensor( + [-0.06508, -0.039306, 0.030934, -0.03417, -0.00656, -0.01553, -0.02088, -0.04938], dtype=tf.dtypes.float32 + ) + + 
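+        # the second batch element was shifted by -0.5 above and uses a different
+        # global-attention pattern, hence its own expected slice: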
expected_slice_1 = tf.convert_to_tensor( + [-0.04055, -0.038399, 0.0396, -0.03735, -0.03415, 0.01357, 0.00145, -0.05709], dtype=tf.dtypes.float32 + ) + + tf.debugging.assert_near(output_hidden_states[0, 2], expected_slice_0, rtol=1e-3) + tf.debugging.assert_near(output_hidden_states[1, -2], expected_slice_1, rtol=1e-3) + + def test_layer_attn_probs(self): + model = TFLongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") + layer = model.longformer.encoder.layer[0].attention.self_attention + hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0) + batch_size, seq_length, hidden_size = hidden_states.shape + + # create attn mask + attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) + attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) + + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 1, 10000.0, attention_mask_1) + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1) + attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] > 0, 10000.0, attention_mask_2) + attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0) + + is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) + is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0) + is_global_attn = tf.math.reduce_any(is_index_global_attn) + + layer_head_mask = None + + output_hidden_states, local_attentions, global_attentions = layer( + [ + hidden_states, + -tf.math.abs(attention_mask), + layer_head_mask, + is_index_masked, + is_index_global_attn, + is_global_attn, + ] + ) + + self.assertEqual(local_attentions.shape, (2, 4, 2, 8)) + self.assertEqual(global_attentions.shape, (2, 2, 3, 4)) + + self.assertTrue((local_attentions[0, 2:4, :, :] == 0).numpy().tolist()) + self.assertTrue((local_attentions[1, 1:4, :, :] == 0).numpy().tolist()) + + # + # The weight of all tokens with local attention must sum to 1. + self.assertTrue( + (tf.math.abs(tf.math.reduce_sum(global_attentions[0, :, :2, :], axis=-1) - 1) < 1e-6).numpy().tolist() + ) + self.assertTrue( + (tf.math.abs(tf.math.reduce_sum(global_attentions[1, :, :1, :], axis=-1) - 1) < 1e-6).numpy().tolist() + ) + + tf.debugging.assert_near( + local_attentions[0, 0, 0, :], + tf.convert_to_tensor( + [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000], dtype=tf.dtypes.float32 + ), + rtol=1e-3, + ) + + tf.debugging.assert_near( + local_attentions[1, 0, 0, :], + tf.convert_to_tensor( + [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000], dtype=tf.dtypes.float32 + ), + rtol=1e-3, + ) + + # All the global attention weights must sum to 1. + self.assertTrue((tf.math.abs(tf.math.reduce_sum(global_attentions, axis=-1) - 1) < 1e-6).numpy().tolist()) + + tf.debugging.assert_near( + global_attentions[0, 0, 1, :], + tf.convert_to_tensor([0.2500, 0.2500, 0.2500, 0.2500], dtype=tf.dtypes.float32), + rtol=1e-3, + ) + tf.debugging.assert_near( + global_attentions[1, 0, 0, :], + tf.convert_to_tensor([0.2497, 0.2500, 0.2499, 0.2504], dtype=tf.dtypes.float32), + rtol=1e-3, + ) + + @slow + def test_inference_no_head(self): + model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096") + + # 'Hello world!' 
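+        # (ids from the RoBERTa byte-level BPE vocabulary that Longformer shares;
+        # 0 and 2 are the <s> and </s> special tokens)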
+ input_ids = tf.convert_to_tensor([[0, 20920, 232, 328, 1437, 2]], dtype=tf.dtypes.int32) + attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32) + + output = model(input_ids, attention_mask=attention_mask)[0] + output_without_mask = model(input_ids)[0] + + expected_output_slice = tf.convert_to_tensor( + [0.0549, 0.1087, -0.1119, -0.0368, 0.0250], dtype=tf.dtypes.float32 + ) + + tf.debugging.assert_near(output[0, 0, -5:], expected_output_slice, rtol=1e-3) + tf.debugging.assert_near(output_without_mask[0, 0, -5:], expected_output_slice, rtol=1e-3) + + @slow + def test_inference_no_head_long(self): + model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096") + + # 'Hello world! ' repeated 1000 times + input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32) + + attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32) + global_attention_mask = tf.zeros(shape_list(input_ids), dtype=tf.dtypes.int32) + # Set global attention on a few random positions + global_attention_mask = tf.tensor_scatter_nd_update( + global_attention_mask, tf.constant([[0, 1], [0, 4], [0, 21]]), tf.constant([1, 1, 1]) + ) + + output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0] + + expected_output_sum = tf.constant(74585.875) + expected_output_mean = tf.constant(0.024267) + + # assert close + tf.debugging.assert_near(tf.reduce_sum(output), expected_output_sum, rtol=1e-4) + tf.debugging.assert_near(tf.reduce_mean(output), expected_output_mean, rtol=1e-4) + + @slow + def test_inference_masked_lm_long(self): + model = TFLongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") + + # 'Hello world! ' repeated 1000 times + input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32) + + output = model(input_ids, labels=input_ids) + loss = output.loss + prediction_scores = output.logits + + expected_loss = tf.constant(0.0073798) + expected_prediction_scores_sum = tf.constant(-610476600.0) + expected_prediction_scores_mean = tf.constant(-3.03477) + + # assert close + tf.debugging.assert_near(tf.reduce_mean(loss), expected_loss, rtol=1e-4) + tf.debugging.assert_near(tf.reduce_sum(prediction_scores), expected_prediction_scores_sum, rtol=1e-4) + tf.debugging.assert_near(tf.reduce_mean(prediction_scores), expected_prediction_scores_mean, rtol=1e-4) + + @slow + def test_inference_masked_lm(self): + model = TFLongformerForMaskedLM.from_pretrained("lysandre/tiny-longformer-random") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 10] + self.assertEqual(output.shape, expected_shape) + + print(output[:, :3, :3]) + + expected_slice = tf.constant( + [ + [ + [-0.04926379, 0.0367098, 0.02099686], + [0.03940692, 0.01547744, -0.01448723], + [0.03495252, -0.05900355, -0.01675752], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_lxmert.py b/test_modeling_tf_lxmert.py new file mode 100644 index 0000000000000000000000000000000000000000..3b3187eb2d4a5cbd90a230906316012d6655ef6d --- /dev/null +++ b/test_modeling_tf_lxmert.py @@ -0,0 +1,770 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +from transformers import LxmertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.lxmert.modeling_tf_lxmert import TFLxmertForPreTraining, TFLxmertModel + + +class TFLxmertModelTester(object): + def __init__( + self, + parent, + vocab_size=300, + hidden_size=28, + num_attention_heads=2, + num_labels=2, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + num_qa_labels=30, + num_object_labels=16, + num_attr_labels=4, + num_visual_features=10, + l_layers=2, + x_layers=1, + r_layers=1, + visual_feat_dim=128, + visual_pos_dim=4, + visual_loss_normalizer=6.67, + seq_length=20, + batch_size=8, + is_training=True, + task_matched=True, + task_mask_lm=True, + task_obj_predict=True, + task_qa=True, + visual_obj_loss=True, + visual_attr_loss=True, + visual_feat_loss=True, + use_token_type_ids=True, + use_lang_mask=True, + output_attentions=False, + output_hidden_states=False, + scope=None, + ): + self.parent = parent + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_labels = num_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.pad_token_id = pad_token_id + self.num_qa_labels = num_qa_labels + self.num_object_labels = num_object_labels + self.num_attr_labels = num_attr_labels + self.l_layers = l_layers + self.x_layers = x_layers + self.r_layers = r_layers + self.visual_feat_dim = visual_feat_dim + self.visual_pos_dim = visual_pos_dim + self.visual_loss_normalizer = visual_loss_normalizer + self.seq_length = seq_length + self.batch_size = batch_size + self.is_training = is_training + self.use_lang_mask = use_lang_mask + self.task_matched = task_matched + self.task_mask_lm = task_mask_lm + self.task_obj_predict = task_obj_predict + self.task_qa = task_qa + self.visual_obj_loss = visual_obj_loss + self.visual_attr_loss = visual_attr_loss + self.visual_feat_loss = visual_feat_loss + self.num_visual_features = num_visual_features + self.use_token_type_ids = use_token_type_ids + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.scope = scope + self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} + + def prepare_config_and_inputs(self): + output_attentions = self.output_attentions + input_ids = ids_tensor([self.batch_size, self.seq_length], 
vocab_size=self.vocab_size) + visual_feats = tf.random.uniform((self.batch_size, self.num_visual_features, self.visual_feat_dim)) + bounding_boxes = tf.random.uniform((self.batch_size, self.num_visual_features, 4)) + + input_mask = None + if self.use_lang_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + obj_labels = None + if self.task_obj_predict: + obj_labels = {} + if self.visual_attr_loss and self.task_obj_predict: + obj_labels["attr"] = ( + ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels), + ids_tensor([self.batch_size, self.num_visual_features], self.num_attr_labels), + ) + if self.visual_feat_loss and self.task_obj_predict: + obj_labels["feat"] = ( + ids_tensor( + [self.batch_size, self.num_visual_features, self.visual_feat_dim], self.num_visual_features + ), + ids_tensor([self.batch_size, self.num_visual_features], self.num_visual_features), + ) + if self.visual_obj_loss and self.task_obj_predict: + obj_labels["obj"] = ( + ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels), + ids_tensor([self.batch_size, self.num_visual_features], self.num_object_labels), + ) + ans = None + if self.task_qa: + ans = ids_tensor([self.batch_size], self.num_qa_labels) + masked_lm_labels = None + if self.task_mask_lm: + masked_lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + matched_label = None + if self.task_matched: + matched_label = ids_tensor([self.batch_size], self.num_labels) + + config = LxmertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_attention_heads=self.num_attention_heads, + num_labels=self.num_labels, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + pad_token_id=self.pad_token_id, + num_qa_labels=self.num_qa_labels, + num_object_labels=self.num_object_labels, + num_attr_labels=self.num_attr_labels, + l_layers=self.l_layers, + x_layers=self.x_layers, + r_layers=self.r_layers, + visual_feat_dim=self.visual_feat_dim, + visual_pos_dim=self.visual_pos_dim, + visual_loss_normalizer=self.visual_loss_normalizer, + task_matched=self.task_matched, + task_mask_lm=self.task_mask_lm, + task_obj_predict=self.task_obj_predict, + task_qa=self.task_qa, + visual_obj_loss=self.visual_obj_loss, + visual_attr_loss=self.visual_attr_loss, + visual_feat_loss=self.visual_feat_loss, + output_attentions=self.output_attentions, + output_hidden_states=self.output_hidden_states, + ) + + return ( + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ) + + def create_and_check_lxmert_model( + self, + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + model = TFLxmertModel(config=config) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + output_attentions=output_attentions, + ) + result = model( + input_ids, 
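+            # the same text and visual inputs (ROI-pooled features plus normalized box
+            # coordinates), this time with the opposite `output_attentions` setting: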
+ visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + output_attentions=not output_attentions, + ) + result = model(input_ids, visual_feats, bounding_boxes, return_dict=False) + result = model(input_ids, visual_feats, bounding_boxes, return_dict=True) + + self.parent.assertEqual(result.language_output.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual( + result.vision_output.shape, (self.batch_size, self.num_visual_features, self.hidden_size) + ) + self.parent.assertEqual(result.pooled_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self, return_obj_labels=False): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "visual_feats": visual_feats, + "visual_pos": bounding_boxes, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + + if return_obj_labels: + inputs_dict["obj_labels"] = obj_labels + + return config, inputs_dict + + def create_and_check_lxmert_for_pretraining( + self, + config, + input_ids, + visual_feats, + bounding_boxes, + token_type_ids, + input_mask, + obj_labels, + masked_lm_labels, + matched_label, + ans, + output_attentions, + ): + model = TFLxmertForPreTraining(config=config) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + obj_labels=obj_labels, + matched_label=matched_label, + ans=ans, + output_attentions=output_attentions, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + output_attentions=not output_attentions, + return_dict=False, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + obj_labels=obj_labels, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + matched_label=matched_label, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + ans=ans, + ) + result = model( + input_ids, + visual_feats, + bounding_boxes, + token_type_ids=token_type_ids, + attention_mask=input_mask, + masked_lm_labels=masked_lm_labels, + obj_labels=obj_labels, + matched_label=matched_label, + ans=ans, + output_attentions=not output_attentions, + ) + + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + +@require_tf +class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = (TFLxmertModel, TFLxmertForPreTraining) if is_tf_available() else () + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFLxmertModelTester(self) + self.config_tester = ConfigTester(self, config_class=LxmertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_lxmert_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lxmert_model(*config_and_inputs) + + def test_lxmert_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lxmert_for_pretraining(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ["unc-nlp/lxmert-base-uncased"]: + model = TFLxmertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1]) + + self.assertEqual(model.config.output_hidden_states, False) + + self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) + + attentions = [language_attentions, vision_attentions, cross_encoder_attentions] + attention_shapes = [ + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + self.model_tester.num_visual_features, + self.model_tester.num_visual_features, + ], + [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features], + ] + + for attention, attention_shape in zip(attentions, attention_shapes): + self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + # 2 hidden states were added + self.assertEqual(out_len + 2, len(outputs)) + language_attentions, vision_attentions, cross_encoder_attentions = (outputs[-3], outputs[-2], outputs[-1]) + self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), self.model_tester.num_hidden_layers["cross_encoder"]) + + attentions = [language_attentions, vision_attentions, cross_encoder_attentions] + attention_shapes = [ + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + self.model_tester.num_visual_features, + self.model_tester.num_visual_features, + ], + [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features], + ] + + for attention, attention_shape in zip(attentions, attention_shapes): + self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) + + def test_hidden_states_output(self): + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_hidden_states_output(config, inputs_dict, model_class):
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+            language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1]
+
+            self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1)
+            self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1)
+
+            seq_length = self.model_tester.seq_length
+            num_visual_features = self.model_tester.num_visual_features
+
+            self.assertListEqual(
+                list(language_hidden_states[0].shape[-2:]),
+                [seq_length, self.model_tester.hidden_size],
+            )
+            self.assertListEqual(
+                list(vision_hidden_states[0].shape[-2:]),
+                [num_visual_features, self.model_tester.hidden_size],
+            )
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(config, inputs_dict, model_class)
+
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+            check_hidden_states_output(config, inputs_dict, model_class)
+
+    def test_pt_tf_model_equivalence(self):
+        from transformers import is_torch_available
+
+        if not is_torch_available():
+            return
+
+        import torch
+
+        import transformers
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
+                return_obj_labels="PreTraining" in model_class.__name__
+            )
+
+            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+            pt_model_class = getattr(transformers, pt_model_class_name)
+
+            config.output_hidden_states = True
+            config.task_obj_predict = False
+
+            tf_model = model_class(config)
+            pt_model = pt_model_class(config)
+
+            # Check we can load pt model in tf and vice-versa with model => model functions
+
+            tf_model = transformers.load_pytorch_model_in_tf2_model(
+                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
+            )
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+
+            # Delete obj labels as we want to compute the hidden states and not the loss
+
+            if "obj_labels" in inputs_dict:
+                del inputs_dict["obj_labels"]
+
+            def torch_type(key):
+                if key in ("visual_feats", "visual_pos"):
+                    return torch.float32
+                else:
+                    return torch.long
+
+            def recursive_numpy_convert(iterable):
+                return_dict = {}
+                for key, value in iterable.items():
+                    if isinstance(value, dict):
+                        return_dict[key] = recursive_numpy_convert(value)
+                    else:
+                        if isinstance(value, (list, tuple)):
+                            # Materialize a tuple: a bare generator expression here would be
+                            # exhausted after a single pass and cannot be indexed by the PyTorch model.
+                            return_dict[key] = tuple(
+                                torch.from_numpy(iter_value.numpy()).to(torch_type(key)) for iter_value in value
+                            )
+                        else:
+                            return_dict[key] = torch.from_numpy(value.numpy()).to(torch_type(key))
+                return return_dict
+
+            pt_inputs_dict = recursive_numpy_convert(self._prepare_for_class(inputs_dict, model_class))
+
+            # need to rename encoder-decoder "inputs" for PyTorch
+            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
+                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)
+            tf_hidden_states = tfo[0].numpy()
+            pt_hidden_states = pto[0].numpy()
+
+            import numpy as np
+
+            tf_nans = np.copy(np.isnan(tf_hidden_states))
+            pt_nans = np.copy(np.isnan(pt_hidden_states))
+
+            pt_hidden_states[tf_nans] = 0
+            tf_hidden_states[tf_nans] = 0
+            pt_hidden_states[pt_nans] = 0
+            tf_hidden_states[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
+            # Debug info (remove when fixed)
+            if max_diff >= 2e-2:
+                print("===")
+                print(model_class)
+                print(config)
+                print(inputs_dict)
+                print(pt_inputs_dict)
+            self.assertLessEqual(max_diff, 6e-2)
+
+            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                import os
+
+                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
+                torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
+                tf_model.save_weights(tf_checkpoint_path)
+                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+            pt_inputs_dict = dict(
+                (name, torch.from_numpy(key.numpy()).to(torch.long))
+                for name, key in self._prepare_for_class(inputs_dict, model_class).items()
+            )
+
+            for key, value in pt_inputs_dict.items():
+                if key in ("visual_feats", "visual_pos"):
+                    pt_inputs_dict[key] = value.to(torch.float32)
+                else:
+                    pt_inputs_dict[key] = value.to(torch.long)
+
+            with torch.no_grad():
+                pto = pt_model(**pt_inputs_dict)
+            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))
+            tfo = tfo[0].numpy()
+            pto = pto[0].numpy()
+            tf_nans = np.copy(np.isnan(tfo))
+            pt_nans = np.copy(np.isnan(pto))
+
+            pto[tf_nans] = 0
+            tfo[tf_nans] = 0
+            pto[pt_nans] = 0
+            tfo[pt_nans] = 0
+
+            max_diff = np.amax(np.abs(tfo - pto))
+            self.assertLessEqual(max_diff, 6e-2)
+
+    def test_save_load(self):
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
+                return_obj_labels="PreTraining" in model_class.__name__
+            )
+
+            model = model_class(config)
+            outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model = model_class.from_pretrained(tmpdirname)
+                after_outputs = model(self._prepare_for_class(inputs_dict, model_class))
+
+                self.assert_outputs_same(after_outputs, outputs)
+
+    def test_compile_tf_model(self):
+        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
+                return_obj_labels="PreTraining" in model_class.__name__
+            )
+
+            input_ids = tf.keras.Input(
+                batch_shape=(self.model_tester.batch_size, self.model_tester.seq_length),
+                name="input_ids",
+                dtype="int32",
+            )
+            visual_feats = tf.keras.Input(
+                batch_shape=(
+                    self.model_tester.batch_size,
+                    self.model_tester.num_visual_features,
+                    self.model_tester.visual_feat_dim,
+                ),
+                name="visual_feats",
+                dtype="int32",
+            )
+            visual_pos = tf.keras.Input(
+                batch_shape=(self.model_tester.batch_size, self.model_tester.num_visual_features, 4),
+                name="visual_pos",
+                dtype="int32",
+            )
+
+            # Prepare our model
+            model = model_class(config)
+
+            # Let's load it from the disk to be sure we
can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + outputs = model(self._prepare_for_class(inputs_dict, model_class)) # build the model + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(input_ids, visual_feats, visual_pos) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids, visual_feats, visual_pos], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFLxmertForPreTraining] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = tf.keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + language_hidden_states = outputs["language_hidden_states"] + vision_hidden_states = outputs["vision_hidden_states"] + language_attentions = outputs["language_attentions"] + vision_attentions = outputs["vision_attentions"] + cross_encoder_attentions = outputs["cross_encoder_attentions"] + + self.assertEqual(len(outputs), num_out) + + self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1) + self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1) + + seq_length = self.model_tester.seq_length + num_visual_features = self.model_tester.num_visual_features + + self.assertListEqual( + list(language_hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + self.assertListEqual( + list(vision_hidden_states[0].shape[-2:]), + [num_visual_features, self.model_tester.hidden_size], + ) + + self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) + self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) + self.assertEqual(len(cross_encoder_attentions), 
self.model_tester.num_hidden_layers["cross_encoder"]) + + attentions = [language_attentions, vision_attentions, cross_encoder_attentions] + attention_shapes = [ + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + [ + self.model_tester.num_attention_heads, + self.model_tester.num_visual_features, + self.model_tester.num_visual_features, + ], + [self.model_tester.num_attention_heads, encoder_key_length, self.model_tester.num_visual_features], + ] + + for attention, attention_shape in zip(attentions, attention_shapes): + self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) diff --git a/test_modeling_tf_marian.py b/test_modeling_tf_marian.py new file mode 100644 index 0000000000000000000000000000000000000000..3db80bccfe0b127bfa9205a8d54a82a48742bb74 --- /dev/null +++ b/test_modeling_tf_marian.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import tempfile +import unittest +import warnings + +from transformers import AutoTokenizer, MarianConfig, MarianTokenizer, TranslationPipeline, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFAutoModelForSeq2SeqLM, TFMarianModel, TFMarianMTModel + + +@require_tf +class TFMarianModelTester: + config_cls = MarianConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], 
self.vocab_size)
+
+        config = self.config_cls(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_ids=[2],
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.pad_token_id,
+            **self.config_updates,
+        )
+        inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = TFMarianModel(config=config).get_decoder()
+        input_ids = inputs_dict["input_ids"]
+
+        input_ids = input_ids[:1, :]
+        attention_mask = inputs_dict["attention_mask"][:1, :]
+        head_mask = inputs_dict["head_mask"]
+        self.batch_size = 1
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+        past_key_values = past_key_values[1]
+
+        # create hypothetical next tokens and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
+
+        # append to next input_ids and attention_mask
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
+
+        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
+        output_from_past_slice = output_from_past[:, :, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
+
+
+def prepare_marian_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = tf.concat(
+            [
+                tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8),
+                tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8),
+            ],
+            axis=-1,
+        )
+    if head_mask is None:
+        head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads))
+    if decoder_head_mask is None:
+        decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads))
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
+ } + + +@require_tf +class TFMarianModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFMarianMTModel, TFMarianModel) if is_tf_available() else () + all_generative_model_classes = (TFMarianMTModel,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_onnx = False + + def setUp(self): + self.model_tester = TFMarianModelTester(self) + self.config_tester = ConfigTester(self, config_class=MarianConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_compile_tf_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + + model_class = self.all_generative_model_classes[0] + input_ids = { + "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), + "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), + } + + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. + # Let's load it from the disk to be sure we can use pre-trained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(input_ids) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Here we build the word embeddings weights if not exists. + # And then we retry to get the attribute once built. 
+ model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size. + assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if tf.debugging.assert_near(a, b, atol=atol): + return True + raise + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +@require_tf +class AbstractMarianIntegrationTest(unittest.TestCase): + maxDiff = 1000 # show more chars for failing integration tests + + @classmethod + def setUpClass(cls) -> None: + cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}" + return cls + + @cached_property + def tokenizer(self) -> MarianTokenizer: + return AutoTokenizer.from_pretrained(self.model_name) + + @property + def eos_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @cached_property + def model(self): + warnings.simplefilter("error") + model: TFMarianMTModel = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name) + assert isinstance(model, TFMarianMTModel) + c = model.config + self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]]) + self.assertEqual(c.max_length, 512) + self.assertEqual(c.decoder_start_token_id, c.pad_token_id) + return model + + def 
_assert_generated_batch_equal_expected(self, **tokenizer_kwargs): + generated_words = self.translate_src_text(**tokenizer_kwargs) + self.assertListEqual(self.expected_text, generated_words) + + def translate_src_text(self, **tokenizer_kwargs): + model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, padding=True, return_tensors="tf") + generated_ids = self.model.generate( + model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128 + ) + generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True) + return generated_words + + +@require_sentencepiece +@require_tokenizers +@require_tf +class TestMarian_MT_EN(AbstractMarianIntegrationTest): + """Cover low resource/high perplexity setting. This breaks if pad_token_id logits not set to LARGE_NEGATIVE.""" + + src = "mt" + tgt = "en" + src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] + expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."] + + @slow + def test_batch_generation_mt_en(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +@require_tf +class TestMarian_en_zh(AbstractMarianIntegrationTest): + src = "en" + tgt = "zh" + src_text = ["My name is Wolfgang and I live in Berlin"] + expected_text = ["我叫沃尔夫冈 我住在柏林"] + + @slow + def test_batch_generation_en_zh(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +@require_tf +class TestMarian_en_ROMANCE(AbstractMarianIntegrationTest): + """Multilingual on target side.""" + + src = "en" + tgt = "ROMANCE" + src_text = [ + ">>fr<< Don't spend so much time watching TV.", + ">>pt<< Your message has been sent.", + ">>es<< He's two years older than me.", + ] + expected_text = [ + "Ne passez pas autant de temps à regarder la télé.", + "A sua mensagem foi enviada.", + "Es dos años más viejo que yo.", + ] + + @slow + def test_batch_generation_en_ROMANCE_multi(self): + self._assert_generated_batch_equal_expected() + + @slow + def test_pipeline(self): + pipeline = TranslationPipeline(self.model, self.tokenizer, framework="tf") + output = pipeline(self.src_text) + self.assertEqual(self.expected_text, [x["translation_text"] for x in output]) diff --git a/test_modeling_tf_mbart.py b/test_modeling_tf_mbart.py new file mode 100644 index 0000000000000000000000000000000000000000..e69aee3c0755c965378c01e00316ac03b07469f6 --- /dev/null +++ b/test_modeling_tf_mbart.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
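Both prepare_marian_inputs_dict above and prepare_mbart_inputs_dict below fall back to deriving attention_mask from the pad token when the caller does not supply one. A minimal, self-contained sketch of that masking convention, with illustrative token values (pad_token_id=1 matches both testers' defaults):

import tensorflow as tf

pad_token_id = 1  # illustrative value; mirrors the testers above and below
input_ids = tf.constant([[5, 7, 9, 1, 1],
                         [3, 4, 1, 1, 1]])
# 1 where the token is real, 0 where it is padding
attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), tf.int8)
print(attention_mask.numpy())
# [[1 1 1 0 0]
#  [1 1 0 0 0]]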
+ +import tempfile +import unittest + +from transformers import AutoTokenizer, MBartConfig, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFAutoModelForSeq2SeqLM, TFMBartForConditionalGeneration, TFMBartModel + + +@require_tf +class TFMBartModelTester: + config_cls = MBartConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_ids=[2], + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + **self.config_updates, + ) + inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFMBartModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + head_mask = inputs_dict["head_mask"] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + def test_compile_tf_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = 
tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + model_class = self.all_generative_model_classes[0] + input_ids = { + "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), + "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), + } + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. + # Let's load it from the disk to be sure we can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + outputs_dict = model(input_ids) + hidden_states = outputs_dict[0] + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + +def prepare_mbart_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + if cross_attn_head_mask is None: + cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_tf +class TFMBartModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFMBartForConditionalGeneration, TFMBartModel) if is_tf_available() else () + all_generative_model_classes = (TFMBartForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_onnx = False + + def setUp(self): + self.model_tester = TFMBartModelTester(self) + self.config_tester = ConfigTester(self, config_class=MBartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = 
model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Here we build the word embeddings weights if not exists. + # And then we retry to get the attribute once built. + model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size. + assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if tf.debugging.assert_near(a, b, atol=atol): + return True + raise + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +TOLERANCE = 1e-4 + + +@require_sentencepiece +@require_tokenizers +@require_tf +class TFMBartModelIntegrationTest(unittest.TestCase): + src_text = [ + " UN 
Chief Says There Is No Military Solution in Syria", + ] + expected_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + ] + model_name = "facebook/mbart-large-en-ro" + + @cached_property + def tokenizer(self): + return AutoTokenizer.from_pretrained(self.model_name) + + @cached_property + def model(self): + model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name) + return model + + def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): + generated_words = self.translate_src_text(**tokenizer_kwargs) + self.assertListEqual(self.expected_text, generated_words) + + def translate_src_text(self, **tokenizer_kwargs): + model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, return_tensors="tf") + generated_ids = self.model.generate( + model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2 + ) + generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + return generated_words + + @slow + def test_batch_generation_en_ro(self): + self._assert_generated_batch_equal_expected() diff --git a/test_modeling_tf_mobilebert.py b/test_modeling_tf_mobilebert.py new file mode 100644 index 0000000000000000000000000000000000000000..4150204a2af52466a7cbdd0096062531122b29ab --- /dev/null +++ b/test_modeling_tf_mobilebert.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
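The test_resize_token_embeddings checks above (for Marian and MBart alike) boil down to one invariant: resizing may add or drop rows of the embedding matrix, but every row shared by the old and new matrices must stay identical. A minimal sketch of that invariant with dummy tensors; the 99-by-32 shape mirrors the testers' vocab_size and hidden_size, and the zero rows merely stand in for whatever initializer the real resize uses:

import tensorflow as tf

hidden_size = 32
old_embeddings = tf.random.uniform((99, hidden_size))  # (vocab_size, hidden_size)
# grow the vocabulary by 10 rows; the new rows are placeholders here
new_embeddings = tf.concat([old_embeddings, tf.zeros((10, hidden_size))], axis=0)

shared = min(old_embeddings.shape[0], new_embeddings.shape[0])
diff = tf.math.reduce_sum(tf.math.abs(old_embeddings[:shared] - new_embeddings[:shared]))
assert float(diff) == 0.0  # shared rows are unchanged, as the tests require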
+ + +import unittest + +from transformers import MobileBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFMobileBertForMaskedLM, + TFMobileBertForMultipleChoice, + TFMobileBertForNextSentencePrediction, + TFMobileBertForPreTraining, + TFMobileBertForQuestionAnswering, + TFMobileBertForSequenceClassification, + TFMobileBertForTokenClassification, + TFMobileBertModel, + ) + + +@require_tf +class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFMobileBertModel, + TFMobileBertForMaskedLM, + TFMobileBertForNextSentencePrediction, + TFMobileBertForPreTraining, + TFMobileBertForQuestionAnswering, + TFMobileBertForSequenceClassification, + TFMobileBertForTokenClassification, + TFMobileBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + class TFMobileBertModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + embedding_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.embedding_size = embedding_size + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MobileBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + 
hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + embedding_size=self.embedding_size, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_mobilebert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMobileBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_mobilebert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMobileBertForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_mobilebert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMobileBertForNextSentencePrediction(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_mobilebert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMobileBertForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual( + result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_mobilebert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFMobileBertForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_mobilebert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFMobileBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + 
"token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_mobilebert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFMobileBertForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_mobilebert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMobileBertForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFMobileBertModelTest.TFMobileBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_mobilebert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mobilebert_for_token_classification(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFMobileBertForMaskedLM, 
TFMobileBertForPreTraining] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + @slow + def test_model_from_pretrained(self): + # for model_name in TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["google/mobilebert-uncased"]: + model = TFMobileBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFMobileBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFMobileBertForPreTraining.from_pretrained("google/mobilebert-uncased") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 30522] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [-4.5919547, -9.248295, -9.645256], + [-6.7306175, -6.440284, -6.6052837], + [-7.2743506, -6.7847915, -6.024673], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_mpnet.py b/test_modeling_tf_mpnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c0305dede979f52e0fa2ab2f4d7c1c916c3b5e00 --- /dev/null +++ b/test_modeling_tf_mpnet.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, Microsoft Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
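The multiple-choice checks above (see create_and_check_mobilebert_for_multiple_choice) expand each shared input to one copy per answer choice with expand_dims followed by tile. A tiny runnable sketch of that reshaping, with illustrative sizes:

import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 7
input_ids = tf.random.uniform((batch_size, seq_length), maxval=99, dtype=tf.int32)

# (batch, seq) -> (batch, 1, seq) -> (batch, num_choices, seq)
multiple_choice_input_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, num_choices, 1))
print(multiple_choice_input_ids.shape)  # (2, 4, 7)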
+ + +import unittest + +from transformers import MPNetConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.mpnet.modeling_tf_mpnet import ( + TFMPNetForMaskedLM, + TFMPNetForMultipleChoice, + TFMPNetForQuestionAnswering, + TFMPNetForSequenceClassification, + TFMPNetForTokenClassification, + TFMPNetModel, + ) + + +class TFMPNetModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=64, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MPNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_mpnet_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMPNetModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + inputs = [input_ids, input_mask] + result = model(inputs) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_mpnet_for_masked_lm( + 
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMPNetForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_mpnet_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFMPNetForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + } + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_mpnet_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFMPNetForSequenceClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_mpnet_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFMPNetForMultipleChoice(config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_mpnet_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFMPNetForTokenClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFMPNetModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFMPNetForMaskedLM, + TFMPNetForMultipleChoice, + TFMPNetForQuestionAnswering, + TFMPNetForSequenceClassification, + TFMPNetForTokenClassification, + TFMPNetModel, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFMPNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_mpnet_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ["microsoft/mpnet-base"]: + model = TFMPNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFMPNetModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFMPNetModel.from_pretrained("microsoft/mpnet-base") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [-0.1067172, 0.08216473, 0.0024543], + [-0.03465879, 0.8354118, -0.03252288], + [-0.06569476, -0.12424111, -0.0494436], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/test_modeling_tf_mt5.py b/test_modeling_tf_mt5.py new file mode 100644 index 0000000000000000000000000000000000000000..9b23e05f7523f5aea28d6444ebe92d0ca2135e96 --- /dev/null +++ b/test_modeling_tf_mt5.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
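+# Note on the score check below: `model(input_ids, labels=labels).loss` returns
+# per-token negative log-likelihoods in these TF models (hence the reduce_sum),
+# so negating their sum yields a sequence log-probability that is directly
+# comparable to the score computed by the original t5 library, as described in
+# the docstring of the test.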
+ +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available(): + import tensorflow as tf + + from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFMT5ModelIntegrationTest(unittest.TestCase): + @slow + def test_small_integration_test(self): + """ + For comparison, run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_mt5_checkpoint = '' + >>> path_to_mtf_small_mt5_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path, extra_ids=100) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = TFAutoModelForSeq2SeqLM.from_pretrained("google/mt5-small") + tokenizer = AutoTokenizer.from_pretrained("google/mt5-small") + + input_ids = tokenizer("Hello there", return_tensors="tf").input_ids + labels = tokenizer("Hi I am", return_tensors="tf").input_ids + + loss = model(input_ids, labels=labels).loss + mtf_score = -tf.math.reduce_sum(loss).numpy() + + EXPECTED_SCORE = -84.9127 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 2e-4) diff --git a/test_modeling_tf_openai.py b/test_modeling_tf_openai.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc684adb77ff681076f15cd7075bbde2f5f4ce7 --- /dev/null +++ b/test_modeling_tf_openai.py @@ -0,0 +1,286 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
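+# `ids_tensor` (imported below from the common test utilities) supplies the
+# random inputs used throughout this tester. As a minimal sketch, and only an
+# approximation of the real helper, it behaves like:
+#
+#     def ids_tensor(shape, vocab_size):
+#         # int32 tensor of random token IDs in [0, vocab_size)
+#         return tf.random.uniform(shape, maxval=vocab_size, dtype=tf.int32)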
+ + +import unittest + +from transformers import OpenAIGPTConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.openai.modeling_tf_openai import ( + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFOpenAIGPTDoubleHeadsModel, + TFOpenAIGPTForSequenceClassification, + TFOpenAIGPTLMHeadModel, + TFOpenAIGPTModel, + ) + + +class TFOpenAIGPTModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = OpenAIGPTConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + n_ctx=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFOpenAIGPTModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): 
+ model = TFOpenAIGPTLMHeadModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_openai_gpt_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): + model = TFOpenAIGPTDoubleHeadsModel(config=config) + + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_openai_gpt_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + config.num_labels = self.num_labels + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": sequence_labels, + } + model = TFOpenAIGPTForSequenceClassification(config) + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TFOpenAIGPTForSequenceClassification) + if is_tf_available() + else () + ) + all_generative_model_classes = ( + (TFOpenAIGPTLMHeadModel,) if is_tf_available() else () + ) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFOpenAIGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_openai_gpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) + + def test_openai_gpt_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs) + + def test_openai_gpt_double_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() 
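+        # Checked below: every class must expose its input embeddings as a keras
+        # Layer; only the generative classes expose output embeddings, and since
+        # OpenAI GPT ties the LM head to the input embeddings without a separate
+        # bias, get_bias() is expected to be None in both branches.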
+ + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_openai_gpt_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_openai_gpt_for_sequence_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFOpenAIGPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFOPENAIGPTModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_openai_gpt(self): + model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt") + input_ids = tf.convert_to_tensor([[481, 4735, 544]], dtype=tf.int32) # the president is + expected_output_ids = [ + 481, + 4735, + 544, + 246, + 963, + 870, + 762, + 239, + 244, + 40477, + 244, + 249, + 719, + 881, + 487, + 544, + 240, + 244, + 603, + 481, + ] # the president is a very good man. " \n " i\'m sure he is, " said the + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_tf_pegasus.py b/test_modeling_tf_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc4e9ae9cb78c1137563b2f378bf6139dbcda35 --- /dev/null +++ b/test_modeling_tf_pegasus.py @@ -0,0 +1,374 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
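+# The cache-consistency check in TFPegasusModelTester below verifies that a
+# single decoder pass over the concatenated sequence matches an incremental pass
+# that reuses past_key_values; schematically (names illustrative, not the exact
+# API surface):
+#
+#     full = decoder(tf.concat([ids, next_ids], axis=-1))[0]
+#     step = decoder(next_ids, past_key_values=past)[0]
+#     # the last positions must agree up to numerical tolerance
+#     tf.debugging.assert_near(full[:, -3:, :], step, rtol=1e-3)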
+ +import tempfile +import unittest + +from transformers import AutoTokenizer, PegasusConfig, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFAutoModelForSeq2SeqLM, TFPegasusForConditionalGeneration, TFPegasusModel + + +@require_tf +class TFPegasusModelTester: + config_cls = PegasusConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_ids=[2], + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + **self.config_updates, + ) + inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFPegasusModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + head_mask = inputs_dict["head_mask"] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append to next input_ids and next_attention_mask
+ next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_pegasus_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + if cross_attn_head_mask is None: + cross_attn_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +@require_tf +class TFPegasusModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFPegasusForConditionalGeneration, TFPegasusModel) if is_tf_available() else () + all_generative_model_classes = (TFPegasusForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + test_onnx = False + + def setUp(self): + self.model_tester = TFPegasusModelTester(self) + self.config_tester = ConfigTester(self, config_class=PegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_compile_tf_model(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + + model_class = self.all_generative_model_classes[0] + input_ids = { + "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), + "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), + } + + # Prepare our model + model = model_class(config) + model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving.
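+        # (Keras builds variables lazily on the first forward pass, so the call
+        # above materializes the weights that save_pretrained serializes below.)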
+ # Let's load it from the disk to be sure we can use pretrained weights + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + + outputs_dict = model(input_ids) + hidden_states = outputs_dict[0] + + # Add a dense layer on top to test integration with other keras modules + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) + + # Compile extended model + extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) + extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes the CI fail + pass + + def test_resize_token_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def _get_word_embedding_weight(model, embedding_layer): + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + # Build the word embedding weights if they don't exist yet, + # then retry fetching the attribute once built. + model(model.dummy_inputs) + if hasattr(embedding_layer, "weight"): + return embedding_layer.weight + else: + return None + + for model_class in self.all_model_classes: + for size in [config.vocab_size - 10, config.vocab_size + 10, None]: + # build the embeddings + model = model_class(config=config) + old_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + old_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + old_final_logits_bias = model.get_bias() + + # reshape the embeddings + model.resize_token_embeddings(size) + new_input_embeddings = _get_word_embedding_weight(model, model.get_input_embeddings()) + new_output_embeddings = _get_word_embedding_weight(model, model.get_output_embeddings()) + new_final_logits_bias = model.get_bias() + + # check that the resized embeddings size matches the desired size.
+ assert_size = size if size is not None else config.vocab_size + + self.assertEqual(new_input_embeddings.shape[0], assert_size) + + # check that weights remain the same after resizing + models_equal = True + for p1, p2 in zip(old_input_embeddings.value(), new_input_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_output_embeddings is not None and new_output_embeddings is not None: + self.assertEqual(new_output_embeddings.shape[0], assert_size) + + models_equal = True + for p1, p2 in zip(old_output_embeddings.value(), new_output_embeddings.value()): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + if old_final_logits_bias is not None and new_final_logits_bias is not None: + old_final_logits_bias = old_final_logits_bias["final_logits_bias"] + new_final_logits_bias = new_final_logits_bias["final_logits_bias"] + self.assertEqual(new_final_logits_bias.shape[0], 1) + self.assertEqual(new_final_logits_bias.shape[1], assert_size) + + models_equal = True + for old, new in zip(old_final_logits_bias.value(), new_final_logits_bias.value()): + for p1, p2 in zip(old, new): + if tf.math.reduce_sum(tf.math.abs(p1 - p2)) > 0: + models_equal = False + self.assertTrue(models_equal) + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors are not close, or if a and b aren't both tensors, raise a helpful AssertionError.""" + if a is None and b is None: + return True + try: + # assert_near raises (rather than returning False) when the tensors differ by more than atol + tf.debugging.assert_near(a, b, atol=atol) + return True + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +@require_sentencepiece +@require_tokenizers +@require_tf +class TFPegasusIntegrationTests(unittest.TestCase): + src_text = [ + """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""", + """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras.
And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """, + ] + expected_text = [ + "California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to reduce the risk of wildfires.", + 'N-Dubz have revealed they\'re "grateful" to have been nominated for four Mobo Awards.', + ] # differs slightly from pytorch, likely due to numerical differences in linear layers + model_name = "google/pegasus-xsum" + + @cached_property + def tokenizer(self): + return AutoTokenizer.from_pretrained(self.model_name) + + @cached_property + def model(self): + model = TFAutoModelForSeq2SeqLM.from_pretrained(self.model_name) + return model + + def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): + generated_words = self.translate_src_text(**tokenizer_kwargs) + assert self.expected_text == generated_words + + def translate_src_text(self, **tokenizer_kwargs): + model_inputs = self.tokenizer(self.src_text, **tokenizer_kwargs, padding=True, return_tensors="tf") + generated_ids = self.model.generate( + model_inputs.input_ids, + attention_mask=model_inputs.attention_mask, + num_beams=2, + use_cache=True, + ) + generated_words = self.tokenizer.batch_decode(generated_ids.numpy(), skip_special_tokens=True) + return generated_words + + @slow + def test_batch_generation(self): + self._assert_generated_batch_equal_expected() diff --git a/test_modeling_tf_pytorch.py b/test_modeling_tf_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..e4d88e12d429acc254f23ac6a08baf5b606299d0 --- /dev/null +++ b/test_modeling_tf_pytorch.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_tf_available, is_torch_available +from transformers.testing_utils import DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, is_pt_tf_cross_test, slow + + +if is_tf_available(): + from transformers import ( + AutoConfig, + BertConfig, + GPT2Config, + T5Config, + TFAutoModel, + TFAutoModelForCausalLM, + TFAutoModelForMaskedLM, + TFAutoModelForPreTraining, + TFAutoModelForQuestionAnswering, + TFAutoModelForSeq2SeqLM, + TFAutoModelForSequenceClassification, + TFAutoModelWithLMHead, + TFBertForMaskedLM, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertModel, + TFGPT2LMHeadModel, + TFRobertaForMaskedLM, + TFT5ForConditionalGeneration, + ) + from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.t5.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_torch_available(): + from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForPreTraining, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelWithLMHead, + BertForMaskedLM, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertModel, + GPT2LMHeadModel, + RobertaForMaskedLM, + T5ForConditionalGeneration, + ) + + +@is_pt_tf_cross_test +class TFPTAutoModelTest(unittest.TestCase): + @slow + def test_model_from_pretrained(self): + import h5py + + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) + + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModel.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertModel) + + model = AutoModel.from_pretrained(model_name, from_tf=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertModel) + + @slow + def test_model_for_pretraining_from_pretrained(self): + import h5py + + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) + + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForPreTraining.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForPreTraining) + + model = AutoModelForPreTraining.from_pretrained(model_name, from_tf=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForPreTraining) + + @slow + def test_model_for_causal_lm(self): + for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, GPT2Config) + + model = TFAutoModelForCausalLM.from_pretrained(model_name, from_pt=True) + model, loading_info = TFAutoModelForCausalLM.from_pretrained( + model_name, output_loading_info=True, from_pt=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFGPT2LMHeadModel) + + model = 
AutoModelForCausalLM.from_pretrained(model_name, from_tf=True) + model, loading_info = AutoModelForCausalLM.from_pretrained( + model_name, output_loading_info=True, from_tf=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, GPT2LMHeadModel) + + @slow + def test_lmhead_model_from_pretrained(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelWithLMHead.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + model = AutoModelWithLMHead.from_pretrained(model_name, from_tf=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + @slow + def test_model_for_masked_lm(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForMaskedLM.from_pretrained(model_name, from_pt=True) + model, loading_info = TFAutoModelForMaskedLM.from_pretrained( + model_name, output_loading_info=True, from_pt=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + model = AutoModelForMaskedLM.from_pretrained(model_name, from_tf=True) + model, loading_info = AutoModelForMaskedLM.from_pretrained( + model_name, output_loading_info=True, from_tf=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForMaskedLM) + + @slow + def test_model_for_encoder_decoder_lm(self): + for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, T5Config) + + model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, from_pt=True) + model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained( + model_name, output_loading_info=True, from_pt=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFT5ForConditionalGeneration) + + model = AutoModelForSeq2SeqLM.from_pretrained(model_name, from_tf=True) + model, loading_info = AutoModelForSeq2SeqLM.from_pretrained( + model_name, output_loading_info=True, from_tf=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, T5ForConditionalGeneration) + + @slow + def test_sequence_classification_model_from_pretrained(self): + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForSequenceClassification) + + model = AutoModelForSequenceClassification.from_pretrained(model_name, from_tf=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForSequenceClassification) + + @slow + def test_question_answering_model_from_pretrained(self): + # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in ["bert-base-uncased"]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, 
TFBertForQuestionAnswering) + + model = AutoModelForQuestionAnswering.from_pretrained(model_name, from_tf=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, BertForQuestionAnswering) + + def test_from_pretrained_identifier(self): + model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, from_pt=True) + self.assertIsInstance(model, TFBertForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) + + model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, from_tf=True) + self.assertIsInstance(model, BertForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) + + def test_from_identifier_from_model_type(self): + model = TFAutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER, from_pt=True) + self.assertIsInstance(model, TFRobertaForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) + + model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER, from_tf=True) + self.assertIsInstance(model, RobertaForMaskedLM) + self.assertEqual(model.num_parameters(), 14410) + self.assertEqual(model.num_parameters(only_trainable=True), 14410) diff --git a/test_modeling_tf_rag.py b/test_modeling_tf_rag.py new file mode 100644 index 0000000000000000000000000000000000000000..679b25aa982a065d27401c61e48497ad6f17a444 --- /dev/null +++ b/test_modeling_tf_rag.py @@ -0,0 +1,1070 @@ +import json +import os +import shutil +import tempfile +import unittest +from unittest.mock import patch + +import numpy as np + +from transformers import BartTokenizer +from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES +from transformers.models.dpr.tokenization_dpr import DPRQuestionEncoderTokenizer +from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available() and is_datasets_available() and is_faiss_available(): + import tensorflow as tf + from datasets import Dataset + import faiss + + from transformers import ( + AutoConfig, + RagConfig, + RagRetriever, + RagTokenizer, + TFAutoModel, + TFAutoModelForSeq2SeqLM, + TFRagModel, + TFRagSequenceForGeneration, + TFRagTokenForGeneration, + ) + + from transformers.modeling_tf_outputs import TFBaseModelOutput + +from .test_modeling_tf_bart import TFBartModelTester +from .test_modeling_tf_dpr import TFDPRModelTester + + +TOLERANCE = 1e-3 + + +def require_retrieval(test_case): + """ + Decorator marking a test that requires a set of dependencies necessary to perform retrieval with + :class:`~transformers.RagRetriever`. + + These tests are skipped when respective libraries are not installed.
+ + """ + if not (is_tf_available() and is_datasets_available() and is_faiss_available()): + test_case = unittest.skip("test requires tensorflow, datasets and faiss")(test_case) + return test_case + + +@require_tf +@require_retrieval +@require_sentencepiece +class TFRagTestMixin: + + all_model_classes = ( + (TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration) + if is_tf_available() and is_datasets_available() and is_faiss_available() + else () + ) + all_generative_model_classes = ( + (TFRagTokenForGeneration, TFRagSequenceForGeneration) + if is_tf_available() and is_datasets_available() and is_faiss_available() + else () + ) + + retrieval_vector_size = 32 + n_docs = 3 + max_combined_length = 16 + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + # DPR tok + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") + os.makedirs(dpr_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + # BART tok + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") + os.makedirs(bart_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @cached_property + def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: + return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) + + @cached_property + def bart_tokenizer(self) -> BartTokenizer: + return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_retriever(self, config): + dataset = Dataset.from_dict( + { + "id": ["0", "1", "3"], + "text": ["foo", "bar", "qux"], + "title": ["Foo", "Bar", "Qux"], + "embeddings": [ + np.ones(self.retrieval_vector_size), + 2 * np.ones(self.retrieval_vector_size), + 3 * np.ones(self.retrieval_vector_size), + ], + } + ) + dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) + tokenizer = self.bart_tokenizer + with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: + mock_load_dataset.return_value = dataset + retriever = RagRetriever( + config, + question_encoder_tokenizer=self.dpr_tokenizer, + generator_tokenizer=tokenizer, + ) + return retriever + + def check_model_with_retriever( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) +
self.assertIsNotNone(config.generator) + + for model_class in self.all_model_classes: + model = model_class(config, retriever=self.get_retriever(config)) + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def check_model_generate_from_context_input_ids( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for i, model_class in enumerate(self.all_generative_model_classes): + model = model_class(config) + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.numpy(), + prefix=config.generator.prefix, + return_tensors="tf", + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) + + # compute doc_scores + doc_scores = tf.squeeze( + tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), + axis=[1], + ) + + outputs = model.generate( + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + ) + + self.assertIsNotNone(outputs) + + def check_model_generate( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + for model_class in self.all_generative_model_classes: + model = model_class(config, retriever=self.get_retriever(config)) + + self.assertTrue(model.config.is_encoder_decoder) + + input_ids = tf.cast(input_ids, tf.int32) + outputs = model.generate( + input_ids=input_ids, + num_beams=2, + num_return_sequences=2, + decoder_start_token_id=config.generator.eos_token_id, + ) + + self.assertIsNotNone(outputs) + + def check_model_without_retriever( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.numpy(), + prefix=config.generator.prefix, + return_tensors="tf", + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + retrieved_doc_embeds = 
tf.cast(retrieved_doc_embeds, tf.float32) + + # compute doc_scores + doc_scores = tf.squeeze( + tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), + axis=[1], + ) + + outputs = model( + input_ids=None, + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def check_model_custom_n_docs( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, n_docs, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.numpy(), + prefix=config.generator.prefix, + return_tensors="tf", + n_docs=n_docs, + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + + retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) + + # compute doc_scores + doc_scores = tf.squeeze( + tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), + axis=[1], + ) + + outputs = model( + input_ids=None, + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + n_docs=n_docs, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], n_docs)) + + def check_model_with_mismatch_n_docs_value( + self, + config, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + retriever_n_docs, + generator_n_docs, + **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + retriever = self.get_retriever(config) + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertTrue(model.config.is_encoder_decoder) + + question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] + + out = retriever( + input_ids, + question_hidden_states.numpy(), + prefix=config.generator.prefix, + return_tensors="tf", + n_docs=retriever_n_docs, + ) + + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], 
+ out["retrieved_doc_embeds"], + ) + + retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) + + # compute doc_scores + doc_scores = tf.squeeze( + tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), + axis=[1], + ) + + self.assertRaises( + AssertionError, + model.__call__, + input_ids=None, + context_input_ids=context_input_ids, + context_attention_mask=context_attention_mask, + doc_scores=doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + n_docs=generator_n_docs, + ) + + def check_model_with_encoder_outputs( + self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs + ): + self.assertIsNotNone(config.question_encoder) + self.assertIsNotNone(config.generator) + + for model_class in self.all_model_classes: + model = model_class(config, retriever=self.get_retriever(config)) + + self.assertTrue(model.config.is_encoder_decoder) + + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + encoder_outputs = TFBaseModelOutput(outputs.generator_enc_last_hidden_state) + + # run only generator + outputs = model( + input_ids=None, + encoder_outputs=encoder_outputs, + doc_scores=outputs.doc_scores, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + ) + + # logits + self.assertEqual( + outputs.logits.shape, + (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), + ) + # generator encoder last hidden states + self.assertEqual( + outputs.generator_enc_last_hidden_state.shape, + (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), + ) + # doc scores + self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) + + def test_model_with_retriever(self): + inputs_dict = self.config_and_inputs + self.check_model_with_retriever(**inputs_dict) + + def test_model_without_retriever(self): + inputs_dict = self.config_and_inputs + self.check_model_without_retriever(**inputs_dict) + + def test_model_generate_from_context_input_ids(self): + inputs_dict = self.config_and_inputs + self.check_model_generate_from_context_input_ids(**inputs_dict) + + def test_model_with_encoder_outputs(self): + inputs_dict = self.config_and_inputs + self.check_model_with_encoder_outputs(**inputs_dict) + + def test_model_generate(self): + inputs_dict = self.config_and_inputs + self.check_model_generate(**inputs_dict) + + def test_model_with_custom_n_docs(self): + inputs_dict = self.config_and_inputs + inputs_dict["n_docs"] = 1 + self.check_model_custom_n_docs(**inputs_dict) + + def test_model_with_mismatch_n_docs_value(self): + inputs_dict = self.config_and_inputs + inputs_dict["retriever_n_docs"] = 3 + inputs_dict["generator_n_docs"] = 2 + self.check_model_with_mismatch_n_docs_value(**inputs_dict) + + +@require_tf +@require_retrieval +class TFRagDPRBartTest(TFRagTestMixin, unittest.TestCase): + @cached_property + def config_and_inputs(self): + question_encoder_tester = TFDPRModelTester(self) + dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs() + generator_tester = TFBartModelTester(self) + bart_config_and_inputs = generator_tester.prepare_config_and_inputs_for_common() + + (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs + (generator_config, bart_inputs_dict) = 
bart_config_and_inputs + decoder_input_ids, decoder_attention_mask = bart_inputs_dict["input_ids"], bart_inputs_dict["attention_mask"] + + config = RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + n_docs=self.n_docs, + retrieval_vector_size=self.retrieval_vector_size, + max_combined_length=self.max_combined_length, + ) + + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + } + + +@require_tf +@require_retrieval +@require_sentencepiece +@require_tokenizers +class TFRagModelIntegrationTests(unittest.TestCase): + @cached_property + def token_model(self): + return TFRagTokenForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" + ) + + @cached_property + def sequence_model(self): + return TFRagSequenceForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" + ) + + def token_model_nq_checkpoint(self, retriever): + return TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) + + def get_rag_config(self): + question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") + return RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + bos_token_id=0, + decoder_start_token_id=2, + eos_token_id=2, + is_encoder_decoder=True, + pad_token_id=1, + vocab_size=50264, + title_sep=" / ", + doc_sep=" // ", + n_docs=5, + max_combined_length=300, + dataset="wiki_dpr", + dataset_split="train", + index_name="exact", + index_path=None, + use_dummy_dataset=True, + retrieval_vector_size=768, + retrieval_batch_size=8, + ) + + @slow + def test_rag_sequence_inference(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_sequence = self.sequence_model + rag_sequence.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + output = rag_sequence( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = tf.TensorShape([5, 5, 50264]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]) + expected_loss = tf.convert_to_tensor([36.7368]) + + tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3) + tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3) + + @slow + def test_rag_token_inference(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + 
question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_token = self.token_model + rag_token.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = tf.TensorShape([5, 5, 50264]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]) + expected_loss = tf.convert_to_tensor([36.3557]) + + tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3) + tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3) + + @slow + def test_rag_token_inference_nq_checkpoint(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_token = self.token_model_nq_checkpoint(retriever=rag_retriever) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + rag_token.save_pretrained(tmpdirname) + rag_token = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = tf.TensorShape([5, 5, 50265]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = tf.convert_to_tensor([[62.9402, 62.7107, 62.2382, 62.1194, 61.8578]]) + expected_loss = tf.convert_to_tensor([32.521812]) + + tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3) + tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3) + + @slow + def test_rag_token_inference_save_pretrained(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_token = self.token_model + rag_token.set_retriever(rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + # model must run once to be functional before loading/saving works + rag_token( + input_ids, + labels=decoder_input_ids, + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + rag_token.save_pretrained(tmpdirname) + rag_token = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever) + + output = rag_token( + input_ids, + labels=decoder_input_ids, + ) + + expected_shape = 
tf.TensorShape([5, 5, 50264]) + self.assertEqual(output.logits.shape, expected_shape) + + expected_doc_scores = tf.convert_to_tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]) + expected_loss = tf.convert_to_tensor([36.3557]) + + tf.debugging.assert_near(output.loss, expected_loss, atol=1e-3) + tf.debugging.assert_near(output.doc_scores, expected_doc_scores, atol=1e-3) + + @slow + def test_init_and_from_pretrained(self): + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + rag_config = RagConfig.from_pretrained("facebook/rag-sequence-base") + rag = TFRagTokenForGeneration(rag_config, retriever=rag_retriever) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + rag( + input_ids, + decoder_input_ids=decoder_input_ids, + ) + + # this should not give any warnings + with tempfile.TemporaryDirectory() as tmpdirname: + rag.save_pretrained(tmpdirname) + rag = TFRagTokenForGeneration.from_pretrained(tmpdirname, retriever=rag_retriever) + + @property + def test_data_questions(self): + return [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + ] + + @slow + def test_rag_token_greedy_search(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") + retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) + + # check first two questions + input_dict = tokenizer( + self.test_data_questions[:2], + return_tensors="tf", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids + attention_mask = input_dict.attention_mask + + # make sure only 1 beam is used + rag_token.config.num_beams = 1 + + output_ids = rag_token.generate( + input_ids, + attention_mask=attention_mask, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " september 22, 2017", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + @slow + def test_rag_token_generate_batch(self): + # NOTE: gold labels come from num_beams=4, so this is effectively a beam-search test + tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") + retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True) + rag_token = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) + + input_dict = tokenizer( + self.test_data_questions, + return_tensors="tf", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids + attention_mask = input_dict.attention_mask + + output_ids = rag_token.generate( + input_ids, +
attention_mask=attention_mask, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " september 22, 2017", + " amplitude modulation", + " stefan persson", + " april 20, 2018", + " the 1970s", + " 7.1. 2", + " 13", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + @slow + def test_rag_sequence_generate_batch(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") + retriever = RagRetriever.from_pretrained( + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + ) + rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever) + + input_dict = tokenizer( + self.test_data_questions, + return_tensors="tf", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids + attention_mask = input_dict.attention_mask + + output_ids = rag_sequence.generate( + input_ids, + attention_mask=attention_mask, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " june 22, 2018", + " amplitude modulation", + " tim besley ( chairman )", + " june 20, 2018", + " 1980", + " 7.0", + " 8", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + @slow + def test_rag_sequence_generate_batch_from_context_input_ids(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") + retriever = RagRetriever.from_pretrained( + "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True + ) + rag_sequence = TFRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever) + input_dict = tokenizer( + self.test_data_questions, + return_tensors="tf", + padding=True, + truncation=True, + ) + + input_ids = input_dict.input_ids + + question_hidden_states = rag_sequence.question_encoder(input_ids)[0] + docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf") + doc_scores = tf.squeeze( + tf.matmul( + tf.expand_dims(question_hidden_states, axis=[1]), docs_dict["retrieved_doc_embeds"], transpose_b=True + ), + axis=[1], + ) + output_ids = rag_sequence.generate( + context_input_ids=docs_dict["context_input_ids"], + context_attention_mask=docs_dict["context_attention_mask"], + doc_scores=doc_scores, + do_deduplication=True, + ) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + EXPECTED_OUTPUTS = [ + " albert einstein", + " june 22, 2018", + " amplitude modulation", + " tim besley ( chairman )", + " june 20, 2018", + " 1980", + " 7.0", + " 8", + ] + self.assertListEqual(outputs, EXPECTED_OUTPUTS) + + +@require_tf +@require_retrieval +class TFRagModelSaveLoadTests(unittest.TestCase): + def get_rag_config(self): + question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") + return RagConfig.from_question_encoder_generator_configs( + question_encoder_config, + generator_config, + bos_token_id=0, + decoder_start_token_id=2, + eos_token_id=2, + is_encoder_decoder=True, + pad_token_id=1, + vocab_size=50264, + title_sep=" / ", + doc_sep=" // ", + n_docs=5, + max_combined_length=300, + dataset="wiki_dpr", + dataset_split="train", + index_name="exact", + index_path=None, + use_dummy_dataset=True, + retrieval_vector_size=768, + retrieval_batch_size=8, + ) + + @slow + def test_rag_sequence_from_pretrained(self): + load_weight_prefix = "tf_rag_model_1" + + 
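+ # Editor's note (assumption about intent): `load_weight_prefix` is passed to
+ # `TFAutoModelForSeq2SeqLM.from_pretrained` further down; sharing the prefix should
+ # make the standalone generator create its variables under the same name scope it
+ # gets inside the composite RAG model, which is what keeps the pretrained-vs-freshly-
+ # assembled loss comparison at the end of this test meaningful.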
rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + with tempfile.TemporaryDirectory() as tmp_dirname: + rag_sequence = TFRagSequenceForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", + "facebook/bart-large-cnn", + retriever=rag_retriever, + config=rag_config, + ) + # check that the from pretrained methods work + rag_sequence.save_pretrained(tmp_dirname) + rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever) + + output = rag_sequence(input_ids, labels=decoder_input_ids) + + loss_pretrained = output.loss + del rag_sequence + + question_encoder = TFAutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator = TFAutoModelForSeq2SeqLM.from_pretrained( + "facebook/bart-large-cnn", load_weight_prefix=load_weight_prefix, name="generator" + ) + + rag_sequence = TFRagSequenceForGeneration( + config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever + ) + + output = rag_sequence(input_ids, labels=decoder_input_ids) + + loss_init = output.loss + + self.assertAlmostEqual(loss_pretrained, loss_init, places=4) + + @slow + def test_rag_token_from_pretrained(self): + load_weight_prefix = "tf_rag_model_1" + + rag_config = self.get_rag_config() + rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") + rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( + "facebook/dpr-question_encoder-single-nq-base" + ) + rag_retriever = RagRetriever( + rag_config, + question_encoder_tokenizer=rag_question_encoder_tokenizer, + generator_tokenizer=rag_decoder_tokenizer, + ) + + input_ids = rag_question_encoder_tokenizer( + "who sings does he love me with reba", return_tensors="tf" + ).input_ids + decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + with tempfile.TemporaryDirectory() as tmp_dirname: + rag_token = TFRagTokenForGeneration.from_pretrained_question_encoder_generator( + "facebook/dpr-question_encoder-single-nq-base", + "facebook/bart-large-cnn", + retriever=rag_retriever, + config=rag_config, + ) + # check that the from pretrained methods work + rag_token.save_pretrained(tmp_dirname) + rag_token.from_pretrained(tmp_dirname, retriever=rag_retriever) + + output = rag_token(input_ids, labels=decoder_input_ids) + + loss_pretrained = output.loss + del rag_token + + question_encoder = TFAutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + generator = TFAutoModelForSeq2SeqLM.from_pretrained( + "facebook/bart-large-cnn", load_weight_prefix=load_weight_prefix, name="generator" + ) + rag_token = TFRagTokenForGeneration( + config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever + ) + + output = rag_token(input_ids, labels=decoder_input_ids) + + loss_init = output.loss + + self.assertAlmostEqual(loss_pretrained, loss_init, places=4) diff --git a/test_modeling_tf_roberta.py 
b/test_modeling_tf_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..d40652efc92abd164f33ed5d19419a8b5119e0c6 --- /dev/null +++ b/test_modeling_tf_roberta.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import RobertaConfig, is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import numpy + import tensorflow as tf + + from transformers.models.roberta.modeling_tf_roberta import ( + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFRobertaForMaskedLM, + TFRobertaForMultipleChoice, + TFRobertaForQuestionAnswering, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TFRobertaModel, + ) + + +class TFRobertaModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RobertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def 
create_and_check_roberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRobertaModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_roberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRobertaForMaskedLM(config=config) + result = model([input_ids, input_mask, token_type_ids]) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_roberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFRobertaForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_roberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRobertaForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_roberta_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFRobertaForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TFRobertaForQuestionAnswering, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFRobertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RobertaConfig, 
hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_roberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_question_answering(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_roberta_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFRobertaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFRobertaModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFRobertaForMaskedLM.from_pretrained("roberta-base") + + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = [1, 11, 50265] + self.assertEqual(list(output.numpy().shape), expected_shape) + # compare the actual values for a slice. + expected_slice = tf.constant( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) + + @slow + def test_inference_no_head(self): + model = TFRobertaModel.from_pretrained("roberta-base") + + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = tf.constant( + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] + ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) + + @slow + def test_inference_classification_head(self): + model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli") + + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = [1, 3] + self.assertEqual(list(output.numpy().shape), expected_shape) + expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-4)) diff --git a/test_modeling_tf_roformer.py b/test_modeling_tf_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5b045187d57e1fccf3b893ec3df27bd6346c42b9 --- /dev/null +++ b/test_modeling_tf_roformer.py @@ -0,0 +1,401 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import RoFormerConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFRoFormerForCausalLM, + TFRoFormerForMaskedLM, + TFRoFormerForMultipleChoice, + TFRoFormerForQuestionAnswering, + TFRoFormerForSequenceClassification, + TFRoFormerForTokenClassification, + TFRoFormerModel, + ) + from transformers.models.roformer.modeling_tf_roformer import ( + TFRoFormerSelfAttention, + TFRoFormerSinusoidalPositionalEmbedding, + ) + + +class TFRoFormerModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = RoFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, 
token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRoFormerModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_lm_head( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.is_decoder = True + model = TFRoFormerForCausalLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + prediction_scores = model(inputs)["logits"] + self.parent.assertListEqual( + list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRoFormerForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFRoFormerForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFRoFormerForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFRoFormerForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFRoFormerForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, +
"token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFRoFormerModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFRoFormerModel, + TFRoFormerForCausalLM, + TFRoFormerForMaskedLM, + TFRoFormerForQuestionAnswering, + TFRoFormerForSequenceClassification, + TFRoFormerForTokenClassification, + TFRoFormerForMultipleChoice, + ) + if is_tf_available() + else () + ) + + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFRoFormerModelTester(self) + self.config_tester = ConfigTester(self, config_class=RoFormerConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFRoFormerModel.from_pretrained("junnyu/roformer_chinese_base") + self.assertIsNotNone(model) + + +@require_tf +class TFRoFormerModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFRoFormerForMaskedLM.from_pretrained("junnyu/roformer_chinese_base") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # TODO Replace vocab size + vocab_size = 50000 + + expected_shape = [1, 6, vocab_size] + self.assertEqual(output.shape, expected_shape) + + print(output[:, :3, :3]) + + # TODO Replace values below with what was printed above. 
+ expected_slice = tf.constant( + [ + [ + [-0.12053341, -1.0264901, 0.29221946], + [-1.5133783, 0.197433, 0.15190607], + [-5.0135403, -3.900256, -0.84038764], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) + + +@require_tf +class TFRoFormerSinusoidalPositionalEmbeddingTest(unittest.TestCase): + tolerance = 1e-4 + + def test_basic(self): + input_ids = tf.constant([[4, 10]]) + emb1 = TFRoFormerSinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6) + + emb = emb1(input_ids.shape) + desired_weights = tf.constant( + [[0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 1.0000], [0.8415, 0.0464, 0.0022, 0.5403, 0.9989, 1.0000]] + ) + + tf.debugging.assert_near(emb, desired_weights, atol=self.tolerance) + + def test_positional_emb_weights_against_roformer(self): + + desired_weights = tf.constant( + [ + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.8415, 0.8219, 0.8020, 0.7819, 0.7617], + [0.9093, 0.9364, 0.9581, 0.9749, 0.9870], + ] + ) + emb1 = TFRoFormerSinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512) + emb1([2, 16, 512]) + weights = emb1.weight[:3, :5] + + tf.debugging.assert_near(weights, desired_weights, atol=self.tolerance) + + +@require_tf +class TFRoFormerSelfAttentionRotaryPositionEmbeddingTest(unittest.TestCase): + tolerance = 1e-4 + + def test_apply_rotary_position_embeddings(self): + # 2,12,16,64 + query_layer = tf.reshape(tf.range(2 * 12 * 16 * 64, dtype=tf.float32), shape=(2, 12, 16, 64)) / 100 + + key_layer = -tf.reshape(tf.range(2 * 12 * 16 * 64, dtype=tf.float32), shape=(2, 12, 16, 64)) / 100 + + embed_positions = TFRoFormerSinusoidalPositionalEmbedding(num_positions=32, embedding_dim=64) + sinusoidal_pos = embed_positions([2, 16, 768])[None, None, :, :] + + query_layer, key_layer = TFRoFormerSelfAttention.apply_rotary_position_embeddings( + sinusoidal_pos, query_layer, key_layer + ) + + desired_query_layer = tf.constant( + [ + [0.0000, 0.0100, 0.0200, 0.0300, 0.0400, 0.0500, 0.0600, 0.0700], + [-0.2012, 0.8897, 0.0263, 0.9401, 0.2074, 0.9463, 0.3481, 0.9343], + [-1.7057, 0.6271, -1.2145, 1.3897, -0.6303, 1.7647, -0.1173, 1.8985], + [-2.1731, -1.6397, -2.7358, 0.2854, -2.1840, 1.7183, -1.3018, 2.4871], + [0.2717, -3.6173, -2.9206, -2.1988, -3.6638, 0.3858, -2.9155, 2.2980], + [3.9859, -2.1580, -0.7984, -4.4904, -4.1181, -2.0252, -4.4782, 1.1253], + ] + ) + desired_key_layer = tf.constant( + [ + [0.0000, -0.0100, -0.0200, -0.0300, -0.0400, -0.0500, -0.0600, -0.0700], + [0.2012, -0.8897, -0.0263, -0.9401, -0.2074, -0.9463, -0.3481, -0.9343], + [1.7057, -0.6271, 1.2145, -1.3897, 0.6303, -1.7647, 0.1173, -1.8985], + [2.1731, 1.6397, 2.7358, -0.2854, 2.1840, -1.7183, 1.3018, -2.4871], + [-0.2717, 3.6173, 2.9206, 2.1988, 3.6638, -0.3858, 2.9155, -2.2980], + [-3.9859, 2.1580, 0.7984, 4.4904, 4.1181, 2.0252, 4.4782, -1.1253], + ] + ) + + tf.debugging.assert_near(query_layer[0, 0, :6, :8], desired_query_layer, atol=self.tolerance) + tf.debugging.assert_near(key_layer[0, 0, :6, :8], desired_key_layer, atol=self.tolerance) diff --git a/test_modeling_tf_t5.py b/test_modeling_tf_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..55f7c862779163d169dcc602224953ff68726d83 --- /dev/null +++ b/test_modeling_tf_t5.py @@ -0,0 +1,668 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import T5Config, is_tf_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ByT5Tokenizer, T5Tokenizer, TFT5EncoderModel, TFT5ForConditionalGeneration, TFT5Model + + +class TFT5ModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_labels = True + self.vocab_size = 99 + self.n_positions = 14 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.d_ff = 37 + self.relative_attention_num_buckets = 8 + self.dropout_rate = 0.1 + self.initializer_factor = 0.002 + self.eos_token_id = 1 + self.pad_token_id = 0 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_labels = None + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = T5Config( + vocab_size=self.vocab_size, + n_positions=self.n_positions, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + ) + + return (config, input_ids, input_mask, token_labels) + + def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): + model = TFT5Model(config=config) + inputs = { + "input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + result = model(inputs) + + result = model(input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids) + decoder_output = result.last_hidden_state + decoder_past = result.past_key_values + encoder_output = result.encoder_last_hidden_state + self.parent.assertListEqual(list(encoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(decoder_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(len(decoder_past), 2) + # decoder_past[0] should correspond to encoder output + self.parent.assertTrue(tf.reduce_all(tf.math.equal(decoder_past[0][0], encoder_output))) + # There should be `num_layers` key value embeddings stored in decoder_past[1] + self.parent.assertEqual(len(decoder_past[1]), config.num_layers) + # There should be a self attn key, a 
self attn value, a cross attn key and a cross attn value stored in each decoder_past[1] tuple + self.parent.assertEqual(len(decoder_past[1][0]), 4) + + def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): + model = TFT5ForConditionalGeneration(config=config) + inputs_dict = { + "input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + + result = model(inputs_dict) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_t5_decoder_model_past(self, config, input_ids, decoder_input_ids, attention_mask): + model = TFT5Model(config=config).get_decoder() + + input_ids = input_ids[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, use_cache=True) + + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + + output_from_no_past = model(next_input_ids)[0] + output_from_past = model(next_tokens, past_key_values=outputs.past_key_values)[0] + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_t5_decoder_model_attention_mask_past( + self, config, input_ids, decoder_input_ids, attention_mask + ): + model = TFT5Model(config=config).get_decoder() + + # create attention mask + half_seq_length = self.seq_length // 2 + attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32) + attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32) + attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) + + # first forward pass + outputs = model(input_ids, attention_mask=attn_mask, use_cache=True) + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size) + vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change) + condition = tf.transpose( + tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size)) + ) + input_ids = tf.where(condition, random_other_next_tokens, input_ids) + + # append to next input_ids and attn_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + attn_mask = tf.concat( + [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], + axis=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0] + output_from_past = model(next_tokens, past_key_values=outputs.past_key_values, attention_mask=attn_mask)[0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).numpy().item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_t5_decoder_model_past_large_inputs( + self, config, input_ids, decoder_input_ids, attention_mask + ): + model = TFT5Model(config=config).get_decoder() + + input_ids = input_ids[:1, :] + attention_mask = attention_mask[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and next_attention_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model( + next_tokens, attention_mask=next_attention_mask, past_key_values=outputs.past_key_values + )[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, token_labels) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase): + + is_encoder_decoder = True + all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else () + all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else () + test_onnx = False + + def setUp(self): + self.model_tester = TFT5ModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_t5_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_model(*config_and_inputs) + + def test_t5_model_v1_1(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + config = config_and_inputs[0] + config.tie_word_embeddings = False + config.feed_forward_proj = "gated-gelu" + self.model_tester.create_and_check_t5_model(config, *config_and_inputs[1:]) + + def test_with_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs) + + def test_t5_decoder_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_decoder_model_past(*config_and_inputs) + + def test_t5_decoder_model_past_with_attn_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() +
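+ # verifies that cached past_key_values still reproduce the from-scratch logits even
+ # when part of the prompt is masked out and mutated (see
+ # create_and_check_t5_decoder_model_attention_mask_past above)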
self.model_tester.create_and_check_t5_decoder_model_attention_mask_past(*config_and_inputs) + + def test_t5_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_saved_model_creation(self): + # This test is too long (>30sec) and makes fail the CI + pass + + @slow + def test_model_from_pretrained(self): + model = TFT5Model.from_pretrained("t5-small") + self.assertIsNotNone(model) + + def test_generate_with_headmasking(self): + # TODO: Fix head-masking according to PyTorch T5 model + pass + + +class TFT5EncoderOnlyModelTester: + def __init__( + self, + parent, + vocab_size=99, + batch_size=13, + encoder_seq_length=7, + # For common tests + use_attention_mask=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + is_training=False, + dropout_rate=0.1, + initializer_factor=0.002, + is_encoder_decoder=False, + eos_token_id=1, + pad_token_id=0, + scope=None, + ): + + self.parent = parent + self.batch_size = batch_size + self.encoder_seq_length = encoder_seq_length + # For common tests + self.seq_length = self.encoder_seq_length + self.use_attention_mask = use_attention_mask + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_factor = initializer_factor + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.is_encoder_decoder = is_encoder_decoder + self.scope = None + self.is_training = is_training + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + + config = T5Config( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor, + eos_token_id=self.eos_token_id, + bos_token_id=self.pad_token_id, + pad_token_id=self.pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + return ( + config, + input_ids, + attention_mask, + ) + + def create_and_check_model( + self, + config, + input_ids, + attention_mask, + ): + model = TFT5EncoderModel(config=config) + result = model( + input_ids=input_ids, + attention_mask=attention_mask, + ) + result = model(input_ids=input_ids) + encoder_output = result.last_hidden_state + + 
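+ # with this tester's defaults the expected shape below is (batch_size,
+ # encoder_seq_length, hidden_size) == (13, 7, 32); the encoder-only model exposes no
+ # decoder outputs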
self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_tf +class TFT5EncoderOnlyModelTest(TFModelTesterMixin, unittest.TestCase): + is_encoder_decoder = False + all_model_classes = (TFT5EncoderModel,) if is_tf_available() else () + test_onnx = False + + def setUp(self): + self.model_tester = TFT5EncoderOnlyModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + # this model is not able to be part of a pipeline + def test_train_pipeline_custom_model(self): + pass + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFT5ModelIntegrationTests(unittest.TestCase): + @cached_property + def model(self): + return TFT5ForConditionalGeneration.from_pretrained("t5-base") + + @slow + def test_small_integration_test(self): + """ + For comparison run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_t5_checkpoint = '' + >>> path_to_mtf_small_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = TFT5ForConditionalGeneration.from_pretrained("t5-small") + tokenizer = T5Tokenizer.from_pretrained("t5-small") + + input_ids = tokenizer("Hello there", return_tensors="tf").input_ids + labels = tokenizer("Hi I am", return_tensors="tf").input_ids + + loss = model(input_ids, labels=labels).loss + mtf_score = -tf.math.reduce_sum(loss).numpy() + + EXPECTED_SCORE = -19.0845 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_small_v1_1_integration_test(self): + """ + For comparison run: + >>> import t5 # pip install t5==0.7.1 + >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary + + >>> path_to_mtf_small_t5_v1_1_checkpoint = '' + >>> path_to_mtf_small_spm_model_path = '' + >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1_1_checkpoint, batch_size=1, tpu=None) + >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100) + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = TFT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small") + tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small") + + input_ids = tokenizer("Hello there", return_tensors="tf").input_ids + labels = tokenizer("Hi I am", return_tensors="tf").input_ids + + loss = model(input_ids, labels=labels).loss + mtf_score = -tf.math.reduce_sum(loss).numpy() + + EXPECTED_SCORE = -59.0293 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_small_byt5_integration_test(self): + """ + For comparison run: + >>> import t5 # pip install t5==0.9.1 + + >>> path_to_tf_checkpoint = '' + >>> t5_model = 
t5.models.MtfModel(model_dir=path_to_tf_checkpoint, batch_size=1, tpu=None) + >>> vocab = t5.data.ByteVocabulary() + >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) + """ + + model = TFT5ForConditionalGeneration.from_pretrained("google/byt5-small") + tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small") + + input_ids = tokenizer("Hello there", return_tensors="tf").input_ids + labels = tokenizer("Hi I am", return_tensors="tf").input_ids + + loss = model(input_ids, labels=labels).loss + mtf_score = -tf.math.reduce_sum(loss).numpy() + + EXPECTED_SCORE = -60.7397 + self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) + + @slow + def test_summarization(self): + model = self.model + tok = T5Tokenizer.from_pretrained("t5-base") + + FRANCE_ARTICLE = 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? 
German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. 
But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noqa + + SHORTER_ARTICLE = '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. 
"As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + + IRAN_ARTICLE = "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. 
What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + + ARTICLE_SUBWAY = 'New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. 
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + + expected_summaries = [ + 'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a cell phone video of the final seconds . "one can hear cries of \'My God\' in several languages," one magazine says .', + "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a preliminary examination into the situation in the occupied Palestinian territory . as members of the court, Palestinians may be subject to counter-charges as well .", + "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller: the debate that has already begun since the announcement of the new framework will likely result in more heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and implement a rigorous inspection regime .", + 'prosecutors say the marriages were part of an immigration scam . 
if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, with nine of her marriages occurring between 1999 and 2002 .', + ] + + task_specific_config = getattr(model.config, "task_specific_params", {}) + summarization_config = task_specific_config.get("summarization", {}) + model.config.update(summarization_config) + + dct = tok( + [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], + max_length=512, + padding="max_length", + truncation=True, + return_tensors="tf", + ) + self.assertEqual(512, dct["input_ids"].shape[1]) + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + num_beams=4, + length_penalty=2.0, + max_length=142, + min_length=56, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + + decoded = [ + tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in hypotheses_batch + ] + + self.assertListEqual( + expected_summaries, + decoded, + ) + + @slow + def test_translation_en_to_de(self): + tok = T5Tokenizer.from_pretrained("t5-base") + model = self.model + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_de", {}) + self.model.config.update(translation_config) + + original_input = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.' + expected_translation = ( + '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.' + ) + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_fr(self): + model = self.model + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_fr", {}) + model.config.update(translation_config) + + en_text = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. ' + + new_truncated_translation = ( + "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre " + "un " + "« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées " + "sous forme " + "de points bleus." 
+ ) + + input_ids = tok(model.config.prefix + en_text, return_tensors="tf").input_ids + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=100, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, new_truncated_translation) + + @slow + def test_translation_en_to_ro(self): + model = self.model + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_ro", {}) + model.config.update(translation_config) + + original_input = "Taco Bell said it plans to add 2,000 locations in the US by 2022." + expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022." + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) diff --git a/test_modeling_tf_transfo_xl.py b/test_modeling_tf_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..a7b6fc3d9effcde310b34908f5f580b3687e350d --- /dev/null +++ b/test_modeling_tf_transfo_xl.py @@ -0,0 +1,577 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import random +import unittest + +from transformers import TransfoXLConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, + TFTransfoXLForSequenceClassification, + TFTransfoXLLMHeadModel, + TFTransfoXLModel, + ) + + +class TFTransfoXLModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.mem_len = 30 + self.key_length = self.seq_length + self.mem_len + self.clamp_len = 15 + self.is_training = True + self.use_labels = True + self.vocab_size = 99 + self.cutoffs = [10, 50, 80] + self.hidden_size = 32 + self.d_embed = 32 + self.num_attention_heads = 4 + self.d_head = 8 + self.d_inner = 128 + self.div_val = 2 + self.num_hidden_layers = 5 + self.scope = None + self.seed = 1 + self.eos_token_id = 0 + self.num_labels = 3 + self.pad_token_id = self.vocab_size - 1 + self.init_range = 0.01 + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = TransfoXLConfig( + vocab_size=self.vocab_size, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + cutoffs=self.cutoffs, + d_model=self.hidden_size, + d_embed=self.d_embed, + n_head=self.num_attention_heads, + d_head=self.d_head, + d_inner=self.d_inner, + div_val=self.div_val, + n_layer=self.num_hidden_layers, + eos_token_id=self.eos_token_id, + pad_token_id=self.vocab_size - 1, + init_range=self.init_range, + num_labels=self.num_labels, + ) + + return (config, input_ids_1, input_ids_2, lm_labels) + + def set_seed(self): + random.seed(self.seed) + tf.random.set_seed(self.seed) + + def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): + model = TFTransfoXLModel(config) + + hidden_states_1, mems_1 = model(input_ids_1).to_tuple() + + inputs = {"input_ids": input_ids_2, "mems": mems_1} + + hidden_states_2, mems_2 = model(inputs).to_tuple() + + self.parent.assertEqual(hidden_states_1.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(hidden_states_2.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertListEqual( + [mem.shape for mem in mems_1], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + self.parent.assertListEqual( + [mem.shape for mem in mems_2], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): + model = TFTransfoXLLMHeadModel(config) + + lm_logits_1, mems_1 = model(input_ids_1).to_tuple() + + inputs = {"input_ids": input_ids_1, "labels": lm_labels} + _, mems_1 = model(inputs).to_tuple() + + lm_logits_2, mems_2 = model([input_ids_2, mems_1]).to_tuple() + + inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels} + + _, mems_2 = model(inputs).to_tuple() + + self.parent.assertEqual(lm_logits_1.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in mems_1], + [(self.mem_len, self.batch_size, 
self.hidden_size)] * self.num_hidden_layers, + ) + + self.parent.assertEqual(lm_logits_2.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in mems_2], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels): + model = TFTransfoXLForSequenceClassification(config) + result = model(input_ids_1) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} + return config, inputs_dict + + +@require_tf +class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (TFTransfoXLModel, TFTransfoXLLMHeadModel, TFTransfoXLForSequenceClassification) if is_tf_available() else () + ) + all_generative_model_classes = () if is_tf_available() else () + # TODO: add this test when TFTransfoXLLMHead has a linear output layer implemented + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFTransfoXLModelTester(self) + self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_transfo_xl_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_model(*config_and_inputs) + + def test_transfo_xl_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs) + + def test_transfo_xl_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_for_sequence_classification(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_other_models_with_output_ebd = [TFTransfoXLForSequenceClassification] + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + if model_class in list_other_models_with_output_ebd: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + def test_xla_mode(self): + # TODO JP: Make TransfoXL XLA compliant + pass + + @slow + def test_model_from_pretrained(self): + for model_name in TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFTransfoXLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFTransfoXLModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_transfo_xl_wt103(self): + model = TFTransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103") + input_ids = tf.convert_to_tensor( + [ + [ + 33, + 1297, + 2, + 1, + 1009, + 4, + 1109, + 11739, + 4762, + 358, + 5, + 25, + 245, + 22, + 1706, + 17, + 20098, + 5, + 3215, + 21, + 37, + 1110, + 3, + 13, + 1041, + 4, + 24, + 603, + 490, + 2, + 71477, + 20098, 
+ 104447, + 2, + 20961, + 1, + 2604, + 4, + 1, + 329, + 3, + 6224, + 831, + 16002, + 2, + 8, + 603, + 78967, + 29546, + 23, + 803, + 20, + 25, + 416, + 5, + 8, + 232, + 4, + 277, + 6, + 1855, + 4601, + 3, + 29546, + 54, + 8, + 3609, + 5, + 57211, + 49, + 4, + 1, + 277, + 18, + 8, + 1755, + 15691, + 3, + 341, + 25, + 416, + 693, + 42573, + 71, + 17, + 401, + 94, + 31, + 17919, + 2, + 29546, + 7873, + 18, + 1, + 435, + 23, + 11011, + 755, + 5, + 5167, + 3, + 7983, + 98, + 84, + 2, + 29546, + 3267, + 8, + 3609, + 4, + 1, + 4865, + 1075, + 2, + 6087, + 71, + 6, + 346, + 8, + 5854, + 3, + 29546, + 824, + 1400, + 1868, + 2, + 19, + 160, + 2, + 311, + 8, + 5496, + 2, + 20920, + 17, + 25, + 15097, + 3, + 24, + 24, + 0, + ] + ], + dtype=tf.int32, + ) + # In 1991 , the remains of Russian Tsar Nicholas II and his family + # ( except for Alexei and Maria ) are discovered . + # The voice of Nicholas's young son , Tsarevich Alexei Nikolaevich , narrates the + # remainder of the story . 1883 Western Siberia , + # a young Grigori Rasputin is asked by his father and a group of men to perform magic . + # Rasputin has a vision and denounces one of the men as a horse thief . Although his + # father initially slaps him for making such an accusation , Rasputin watches as the + # man is chased outside and beaten . Twenty years later , Rasputin sees a vision of + # the Virgin Mary , prompting him to become a priest . Rasputin quickly becomes famous , + # with people , even a bishop , begging for his blessing . + + expected_output_ids = [ + 33, + 1297, + 2, + 1, + 1009, + 4, + 1109, + 11739, + 4762, + 358, + 5, + 25, + 245, + 22, + 1706, + 17, + 20098, + 5, + 3215, + 21, + 37, + 1110, + 3, + 13, + 1041, + 4, + 24, + 603, + 490, + 2, + 71477, + 20098, + 104447, + 2, + 20961, + 1, + 2604, + 4, + 1, + 329, + 3, + 6224, + 831, + 16002, + 2, + 8, + 603, + 78967, + 29546, + 23, + 803, + 20, + 25, + 416, + 5, + 8, + 232, + 4, + 277, + 6, + 1855, + 4601, + 3, + 29546, + 54, + 8, + 3609, + 5, + 57211, + 49, + 4, + 1, + 277, + 18, + 8, + 1755, + 15691, + 3, + 341, + 25, + 416, + 693, + 42573, + 71, + 17, + 401, + 94, + 31, + 17919, + 2, + 29546, + 7873, + 18, + 1, + 435, + 23, + 11011, + 755, + 5, + 5167, + 3, + 7983, + 98, + 84, + 2, + 29546, + 3267, + 8, + 3609, + 4, + 1, + 4865, + 1075, + 2, + 6087, + 71, + 6, + 346, + 8, + 5854, + 3, + 29546, + 824, + 1400, + 1868, + 2, + 19, + 160, + 2, + 311, + 8, + 5496, + 2, + 20920, + 17, + 25, + 15097, + 3, + 24, + 24, + 0, + 33, + 1, + 1857, + 2, + 1, + 1009, + 4, + 1109, + 11739, + 4762, + 358, + 5, + 25, + 245, + 28, + 1110, + 3, + 13, + 1041, + 4, + 24, + 603, + 490, + 2, + 71477, + 20098, + 104447, + 2, + 20961, + 1, + 2604, + 4, + 1, + 329, + 3, + 0, + ] + # In 1991, the remains of Russian Tsar Nicholas II and his family ( + # except for Alexei and Maria ) are discovered. The voice of young son, + # Tsarevich Alexei Nikolaevich, narrates the remainder of the story. + # 1883 Western Siberia, a young Grigori Rasputin is asked by his father + # and a group of men to perform magic. Rasputin has a vision and + # denounces one of the men as a horse thief. Although his father initially + # slaps him for making such an accusation, Rasputin watches as the man + # is chased outside and beaten. Twenty years later, Rasputin sees a vision + # of the Virgin Mary, prompting him to become a priest. + # Rasputin quickly becomes famous, with people, even a bishop, begging for + # his blessing. In the 1990s, the remains of Russian Tsar + # Nicholas II and his family were discovered. 
The voice of young son, + # Tsarevich Alexei Nikolaevich, narrates the remainder of the story. + + output_ids = model.generate(input_ids, max_length=200, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_tf_wav2vec2.py b/test_modeling_tf_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..47c378cc88b58fc681f9b78ddac66b4923dfabc7 --- /dev/null +++ b/test_modeling_tf_wav2vec2.py @@ -0,0 +1,539 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import inspect +import math +import unittest + +import numpy as np + +from transformers import Wav2Vec2Config, is_tf_available +from transformers.testing_utils import require_datasets, require_soundfile, require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFWav2Vec2ForCTC, TFWav2Vec2Model, Wav2Vec2Processor + from transformers.models.wav2vec2.modeling_tf_wav2vec2 import _compute_mask_indices + + +@require_tf +class TFWav2Vec2ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + vocab_size=32, + do_stable_layer_norm=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.scope = scope + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + 
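# worked example of the length computation above, using the tester defaults + # seq_length=1024, conv_kernel=(8, 8, 8), conv_stride=(4, 4, 4): + # 1024 -> (1024 - 7) / 4 = 254.25 -> (254.25 - 7) / 4 = 61.8125 -> (61.8125 - 7) / 4 = 13.703125, + # so the feature encoder yields output_seq_length = ceil(13.703125) = 14 frames + 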
self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + def prepare_config_and_inputs(self): + input_values = tf.cast(ids_tensor([self.batch_size, self.seq_length], 32768), tf.float32) / 32768.0 + attention_mask = tf.ones_like(input_values) + + config = Wav2Vec2Config( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + do_stable_layer_norm=self.do_stable_layer_norm, + ) + + return config, input_values, attention_mask + + def create_and_check_model(self, config, input_values, attention_mask): + model = TFWav2Vec2Model(config) + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + config.layerdrop = 0.0 + model = TFWav2Vec2Model(config) + + input_values = input_values[:3] + attention_mask = tf.ones_like(input_values) + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + # convert values that are over input_lengths to padding + input_values = input_values * length_mask + attention_mask = attention_mask * length_mask + + batch_outputs = model(input_values, attention_mask=attention_mask, training=False).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice, training=False).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = TFWav2Vec2ForCTC(config) + + input_values = input_values[:3] + attention_mask = tf.ones_like(input_values) + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + max_length_labels = model.wav2vec2._get_feat_extract_output_lengths(input_lengths) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + # convert values that are over input_lengths to padding + input_values = input_values * length_mask + attention_mask = attention_mask * length_mask + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + self.parent.assertTrue(abs(labels.shape[0] * mean_loss - sum_loss) < 1e-2) + + def check_training(self, 
config, input_values, *args): + model = TFWav2Vec2ForCTC(config) + + # freeze feature encoder + model.freeze_feature_extractor() + + input_values = input_values[:3] + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + max_length_labels = model.wav2vec2._get_feat_extract_output_lengths(input_lengths) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + input_values = input_values * length_mask + + pad_size = max(max_length_labels) - labels.shape[1] + labels = tf.pad(labels, ((0, 0), (0, pad_size)), constant_values=-100) + + loss = model(input_values, labels=labels, training=True).loss + + self.parent.assertFalse(tf.math.is_inf(loss)) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_tf +class TFWav2Vec2ModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = (TFWav2Vec2Model, TFWav2Vec2ForCTC) if is_tf_available() else () + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFWav2Vec2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because input_values != input_ids + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # overwrite because input_values != input_ids + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + input_values = inputs_keywords.pop("input_values", None) + outputs_keywords = model(input_values, **inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.output_seq_length, 
self.model_tester.hidden_size], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + # Wav2Vec2 has no inputs_embeds + def test_inputs_embeds(self): + pass + + # Wav2Vec2 cannot resize token embeddings + # since it has no token embeddings + def test_resize_tokens_embeddings(self): + pass + + # Wav2Vec2 has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @slow + def test_model_from_pretrained(self): + model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsNotNone(model) + + +@require_tf +class TFWav2Vec2RobustModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFWav2Vec2Model, TFWav2Vec2ForCTC) if is_tf_available() else () + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFWav2Vec2ModelTester( + self, + conv_stride=(3, 3, 3), + feat_extract_norm="layer", + do_stable_layer_norm=True, + scope="robust", + ) + self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) + + # overwrite because input_values != input_ids + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # overwrite because input_values != input_ids + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + input_values = inputs_keywords.pop("input_values", None) + outputs_keywords = model(input_values, **inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + 
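# hidden_states is expected to hold num_hidden_layers plus one entries: the initial + # feature-projection output fed into the encoder, then one output per transformer layer + 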
hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.output_seq_length, self.model_tester.hidden_size], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + # Wav2Vec2 has no inputs_embeds + def test_inputs_embeds(self): + pass + + # Wav2Vec2 cannot resize token embeddings + # since it has no token embeddings + def test_resize_tokens_embeddings(self): + pass + + # Wav2Vec2 has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + @slow + def test_model_from_pretrained(self): + model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsNotNone(model) + + +@require_tf +class TFWav2Vec2UtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + + self.assertListEqual( + tf.reduce_sum(mask, -1).numpy().tolist(), [mask_prob * sequence_length for _ in range(batch_size)] + ) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + + # because of overlap, the masks don't have to add up exactly to `mask_prob * sequence_length`; they just have to be smaller or equal + for batch_sum in tf.reduce_sum(mask, -1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + +@require_tf +@slow +@require_datasets +@require_soundfile +class TFWav2Vec2ModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + import soundfile as sf + + ids = [f"1272-141231-000{i}" for i in range(num_samples)] + + # map audio files to raw speech arrays + def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + + ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) + + return ds["speech"][:num_samples] + + def test_inference_ctc_normal(self): + model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="tf", sampling_rate=16000).input_values + + logits = model(input_values).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = 
processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + input_values = processor( + input_speech, return_tensors="tf", padding=True, truncation=True, sampling_rate=16000 + ).input_values + + logits = model(input_values).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self") + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="tf", padding=True, truncation=True) + + input_values = inputs.input_values + attention_mask = inputs.attention_mask + + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/test_modeling_tf_xlm.py b/test_modeling_tf_xlm.py new file mode 100644 index 0000000000000000000000000000000000000000..03dc1f0d46312cf3154948905e3878f8e1ba60b8 --- /dev/null +++ b/test_modeling_tf_xlm.py @@ -0,0 +1,367 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
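+""" Testing suite for the TensorFlow XLM model. """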
+ + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXLMForMultipleChoice, + TFXLMForQuestionAnsweringSimple, + TFXLMForSequenceClassification, + TFXLMForTokenClassification, + TFXLMModel, + TFXLMWithLMHeadModel, + XLMConfig, + ) + + +class TFXLMModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_lengths = True + self.use_token_type_ids = True + self.use_labels = True + self.gelu_activation = True + self.sinusoidal_embeddings = False + self.causal = False + self.asm = False + self.n_langs = 2 + self.vocab_size = 99 + self.n_special = 0 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.summary_type = "last" + self.use_proj = True + self.scope = None + self.bos_token_id = 0 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = XLMConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + bos_token_id=self.bos_token_id, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFXLMModel(config=config) + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + 
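+ # the model should accept both dict-style and positional-list inputs; the shape check below runs on the second call's output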
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFXLMWithLMHeadModel(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + outputs = model(inputs) + + result = outputs + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFXLMForQuestionAnsweringSimple(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFXLMForSequenceClassification(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_xlm_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_labels = self.num_labels + model = TFXLMForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_xlm_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = TFXLMForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "langs": token_type_ids, + "lengths": input_lengths, + } + return config, inputs_dict + + +@require_tf +class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFXLMModel, + TFXLMWithLMHeadModel, + 
TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TFXLMForTokenClassification, + TFXLMForMultipleChoice, + ) + if is_tf_available() + else () + ) + all_generative_model_classes = ( + (TFXLMWithLMHeadModel,) if is_tf_available() else () + ) # TODO (PVP): Check other models whether language generation is also applicable + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFXLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) + + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) + + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) + + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFXLMModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFXLMModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_xlm_mlm_en_2048(self): + model = TFXLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048") + input_ids = tf.convert_to_tensor([[14, 447]], dtype=tf.int32) # the president + expected_output_ids = [ + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + ] # the president the president the president the president the president the president the president the president the president the president + # TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_tf_xlm_roberta.py b/test_modeling_tf_xlm_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..695a403b7b0bb097a2925f979ed1c5b102e66832 --- /dev/null +++ b/test_modeling_tf_xlm_roberta.py @@ -0,0 +1,57 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFXLMRobertaModel + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFXLMRobertaModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") + + features = { + "input_ids": tf.convert_to_tensor([[0, 2646, 10269, 83, 99942, 2]], dtype=tf.int32), # "My dog is cute" + "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), + } + + output = model(features)["last_hidden_state"] + expected_shape = tf.TensorShape((1, 6, 768)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = tf.convert_to_tensor( + [ + [ + [0.0681762, 0.10894451, 0.06772504], + [-0.06423668, 0.02366615, 0.04329344], + [-0.06057295, 0.09974135, -0.00070584], + ] + ], + dtype=tf.float32, + ) + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/test_modeling_tf_xlnet.py b/test_modeling_tf_xlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..51fba4575fe0fd9174c4f04e61b4d757d9af4c89 --- /dev/null +++ b/test_modeling_tf_xlnet.py @@ -0,0 +1,795 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
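+""" Testing suite for the TensorFlow XLNet model. """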
+ + +import random +import unittest + +from transformers import XLNetConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.xlnet.modeling_tf_xlnet import ( + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXLNetForMultipleChoice, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetLMHeadModel, + TFXLNetModel, + ) + + +class TFXLNetModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.mem_len = 10 + # self.key_len = seq_length + mem_len + self.clamp_len = -1 + self.reuse_len = 15 + self.is_training = True + self.use_labels = True + self.vocab_size = 99 + self.cutoffs = [10, 50, 80] + self.hidden_size = 32 + self.num_attention_heads = 4 + self.d_inner = 128 + self.num_hidden_layers = 5 + self.type_sequence_label_size = 2 + self.untie_r = True + self.bi_data = False + self.same_length = False + self.initializer_range = 0.05 + self.seed = 1 + self.type_vocab_size = 2 + self.bos_token_id = 1 + self.eos_token_id = 2 + self.pad_token_id = 5 + self.num_choices = 4 + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) + + input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) + perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length), dtype=tf.float32) + perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32) + perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1) + # perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32) + target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32) + target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1) + # target_mapping[:, 0, -1] = 1.0 # predict last token + + sequence_labels = None + lm_labels = None + is_impossible_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + + config = XLNetConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + n_head=self.num_attention_heads, + d_inner=self.d_inner, + n_layer=self.num_hidden_layers, + untie_r=self.untie_r, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + same_length=self.same_length, + reuse_len=self.reuse_len, + bi_data=self.bi_data, + initializer_range=self.initializer_range, + num_labels=self.type_sequence_label_size, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + eos_token_id=self.eos_token_id, + ) + + return ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) + + def set_seed(self): + random.seed(self.seed) + tf.random.set_seed(self.seed) + + def 
create_and_check_xlnet_base_model( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): + model = TFXLNetModel(config) + + inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids} + result = model(inputs) + + inputs = [input_ids_1, input_mask] + result = model(inputs) + + config.use_mems_eval = False + model = TFXLNetModel(config) + no_mems_outputs = model(inputs) + self.parent.assertEqual(len(no_mems_outputs), 1) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_lm_head( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): + model = TFXLNetLMHeadModel(config) + + inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids} + all_logits_1, mems_1 = model(inputs_1).to_tuple() + + inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids} + all_logits_2, mems_2 = model(inputs_2).to_tuple() + + inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping} + logits, _ = model(inputs_3).to_tuple() + + self.parent.assertEqual(all_logits_1.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in mems_1], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + self.parent.assertEqual(all_logits_2.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in mems_2], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): + model = TFXLNetForQuestionAnsweringSimple(config) + + inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids} + result = model(inputs) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): + model = TFXLNetForSequenceClassification(config) + + result = model(input_ids_1) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_for_token_classification( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + 
is_impossible_labels, + ): + config.num_labels = input_ids_1.shape[1] + model = TFXLNetForTokenClassification(config) + inputs = { + "input_ids": input_ids_1, + "attention_mask": input_mask, + # 'token_type_ids': token_type_ids + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, config.num_labels)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_for_multiple_choice( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): + config.num_choices = self.num_choices + model = TFXLNetForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids_1, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(segment_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size * self.num_choices, self.hidden_size)] * self.num_hidden_layers, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} + return config, inputs_dict + + +@require_tf +class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForMultipleChoice, + ) + if is_tf_available() + else () + ) + all_generative_model_classes = ( + (TFXLNetLMHeadModel,) if is_tf_available() else () + ) # TODO (PVP): Check other models whether language generation is also applicable + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFXLNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlnet_base_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) + + def test_xlnet_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) + + def test_xlnet_sequence_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) + + def test_xlnet_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs) + + def test_xlnet_qa(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) + + def test_xlnet_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFXLNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFXLNetModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_xlnet_base_cased(self): + model = TFXLNetLMHeadModel.from_pretrained("xlnet-base-cased") + input_ids = tf.convert_to_tensor( + [ + [ + 67, + 2840, + 19, + 18, + 1484, + 20, + 965, + 29077, + 8719, + 1273, + 21, + 45, + 273, + 17, + 10, + 15048, + 28, + 27511, + 21, + 4185, + 11, + 41, + 2444, + 9, + 32, + 1025, + 20, + 8719, + 26, + 23, + 673, + 966, + 19, + 29077, + 20643, + 27511, + 20822, + 20643, + 19, + 17, + 6616, + 17511, + 18, + 8978, + 20, + 18, + 777, + 9, + 19233, + 1527, + 17669, + 19, + 24, + 673, + 17, + 28756, + 150, + 12943, + 4354, + 153, + 27, + 442, + 37, + 45, + 668, + 21, + 24, + 256, + 20, + 416, + 22, + 2771, + 4901, + 9, + 12943, + 4354, + 153, + 51, + 24, + 3004, + 21, + 28142, + 23, + 65, + 20, + 18, + 416, + 34, + 24, + 2958, + 22947, + 9, + 1177, + 45, + 668, + 3097, + 13768, + 23, + 103, + 28, + 441, + 148, + 48, + 20522, + 19, + 12943, + 4354, + 153, + 12860, + 34, + 18, + 326, + 27, + 17492, + 684, + 21, + 6709, + 9, + 8585, + 123, + 266, + 19, + 12943, + 4354, + 153, + 6872, + 24, + 3004, + 20, + 18, + 9225, + 2198, + 19, + 12717, + 103, + 22, + 401, + 24, + 6348, + 9, + 12943, + 4354, + 153, + 1068, + 2768, + 2286, + 19, + 33, + 104, + 19, + 176, + 24, + 9313, + 19, + 20086, + 28, + 45, + 10292, + 9, + 4, + 3, + ] + ], + dtype=tf.int32, + ) + # In 1991, the remains of Russian Tsar Nicholas II and his family + # (except for Alexei and Maria) are discovered. + # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the + # remainder of the story. 1883 Western Siberia, + # a young Grigori Rasputin is asked by his father and a group of men to perform magic. + # Rasputin has a vision and denounces one of the men as a horse thief. Although his + # father initially slaps him for making such an accusation, Rasputin watches as the + # man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + # the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, + # with people, even a bishop, begging for his blessing. 
""" + + expected_output_ids = [ + 67, + 2840, + 19, + 18, + 1484, + 20, + 965, + 29077, + 8719, + 1273, + 21, + 45, + 273, + 17, + 10, + 15048, + 28, + 27511, + 21, + 4185, + 11, + 41, + 2444, + 9, + 32, + 1025, + 20, + 8719, + 26, + 23, + 673, + 966, + 19, + 29077, + 20643, + 27511, + 20822, + 20643, + 19, + 17, + 6616, + 17511, + 18, + 8978, + 20, + 18, + 777, + 9, + 19233, + 1527, + 17669, + 19, + 24, + 673, + 17, + 28756, + 150, + 12943, + 4354, + 153, + 27, + 442, + 37, + 45, + 668, + 21, + 24, + 256, + 20, + 416, + 22, + 2771, + 4901, + 9, + 12943, + 4354, + 153, + 51, + 24, + 3004, + 21, + 28142, + 23, + 65, + 20, + 18, + 416, + 34, + 24, + 2958, + 22947, + 9, + 1177, + 45, + 668, + 3097, + 13768, + 23, + 103, + 28, + 441, + 148, + 48, + 20522, + 19, + 12943, + 4354, + 153, + 12860, + 34, + 18, + 326, + 27, + 17492, + 684, + 21, + 6709, + 9, + 8585, + 123, + 266, + 19, + 12943, + 4354, + 153, + 6872, + 24, + 3004, + 20, + 18, + 9225, + 2198, + 19, + 12717, + 103, + 22, + 401, + 24, + 6348, + 9, + 12943, + 4354, + 153, + 1068, + 2768, + 2286, + 19, + 33, + 104, + 19, + 176, + 24, + 9313, + 19, + 20086, + 28, + 45, + 10292, + 9, + 4, + 3, + 19, + 12943, + 4354, + 153, + 27, + 442, + 22, + 2771, + 4901, + 9, + 69, + 27, + 50, + 551, + 22, + 2771, + 4901, + 19, + 21, + 45, + 668, + 21, + 18, + 416, + 41, + 1499, + 22, + 755, + 18, + 14285, + 9, + 12943, + 4354, + 153, + 27, + 1499, + 22, + 642, + 22, + ] + # In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) + # are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, + # narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin + # is asked by his father and a group of men to perform magic. Rasputin has a vision and + # denounces one of the men as a horse thief. Although his father initially slaps + # him for making such an accusation, Rasputin watches as the man is chased outside and beaten. + # Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. + # Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. + # , Rasputin is asked to perform magic. + # He is not able to perform magic, and his father and + # the men are forced to leave the monastery. Rasputin is forced to return to + + output_ids = model.generate(input_ids, max_length=200, do_sample=False) + + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/test_modeling_transfo_xl.py b/test_modeling_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..f9b01e638d97523f3257980c74718dd941821a00 --- /dev/null +++ b/test_modeling_transfo_xl.py @@ -0,0 +1,761 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import random +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import TransfoXLConfig, TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel + from transformers.models.transfo_xl.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST + + +class TransfoXLModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 14 + self.seq_length = 7 + self.mem_len = 30 + self.key_length = self.seq_length + self.mem_len + self.clamp_len = 15 + self.is_training = False + self.use_labels = True + self.vocab_size = 99 + self.cutoffs = [10, 50, 80] + self.hidden_size = 32 + self.d_embed = 32 + self.num_attention_heads = 4 + self.d_head = 8 + self.d_inner = 128 + self.div_val = 2 + self.num_hidden_layers = 5 + self.scope = None + self.seed = 1 + self.eos_token_id = 0 + self.num_labels = 3 + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + lm_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = TransfoXLConfig( + vocab_size=self.vocab_size, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + cutoffs=self.cutoffs, + d_model=self.hidden_size, + d_embed=self.d_embed, + n_head=self.num_attention_heads, + d_head=self.d_head, + d_inner=self.d_inner, + div_val=self.div_val, + n_layer=self.num_hidden_layers, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + ) + + return (config, input_ids_1, input_ids_2, lm_labels) + + def set_seed(self): + random.seed(self.seed) + torch.manual_seed(self.seed) + + def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels): + model = TransfoXLModel(config) + model.to(torch_device) + model.eval() + + outputs1 = model(input_ids_1) + outputs2 = model(input_ids_2, outputs1["mems"]) + outputs = { + "hidden_states_1": outputs1["last_hidden_state"], + "mems_1": outputs1["mems"], + "hidden_states_2": outputs2["last_hidden_state"], + "mems_2": outputs2["mems"], + } + return outputs + + def check_transfo_xl_model_output(self, result): + self.parent.assertEqual(result["hidden_states_1"].shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result["hidden_states_2"].shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertListEqual( + [mem.shape for mem in result["mems_1"]], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + self.parent.assertListEqual( + [mem.shape for mem in result["mems_2"]], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): + model = TransfoXLLMHeadModel(config) + model.to(torch_device) + model.eval() + + lm_logits_1 = model(input_ids_1)["prediction_scores"] + outputs1 = model(input_ids_1, labels=lm_labels) + lm_logits_2 = model(input_ids_2, mems=outputs1["mems"])["prediction_scores"] + outputs2 = 
model(input_ids_2, labels=lm_labels, mems=outputs1["mems"]) + + outputs = { + "loss_1": outputs1["losses"], + "mems_1": outputs1["mems"], + "lm_logits_1": lm_logits_1, + "loss_2": outputs2["losses"], + "mems_2": outputs2["mems"], + "lm_logits_2": lm_logits_2, + } + return outputs + + def check_transfo_xl_lm_head_output(self, result): + self.parent.assertEqual(result["loss_1"].shape, (self.batch_size, self.seq_length - 1)) + self.parent.assertEqual(result["lm_logits_1"].shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in result["mems_1"]], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + self.parent.assertEqual(result["loss_2"].shape, (self.batch_size, self.seq_length - 1)) + self.parent.assertEqual(result["lm_logits_2"].shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in result["mems_2"]], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels): + config.num_labels = self.num_labels + model = TransfoXLForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids_1) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} + return config, inputs_dict + + +@require_torch +class TransfoXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (TransfoXLModel, TransfoXLLMHeadModel, TransfoXLForSequenceClassification) if is_torch_available() else () + ) + all_generative_model_classes = (TransfoXLLMHeadModel,) if is_torch_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = True + + def check_cutoffs_and_n_token( + self, copied_cutoffs, layer, model_embed, model, model_class, resized_value, vocab_size + ): + # Check that the cutoffs were modified accordingly + for i in range(len(copied_cutoffs)): + if i < layer: + self.assertEqual(model_embed.cutoffs[i], copied_cutoffs[i]) + if model_class == TransfoXLLMHeadModel: + self.assertEqual(model.crit.cutoffs[i], copied_cutoffs[i]) + if i < len(model.config.cutoffs): + self.assertEqual(model.config.cutoffs[i], copied_cutoffs[i]) + else: + self.assertEqual(model_embed.cutoffs[i], copied_cutoffs[i] + resized_value) + if model_class == TransfoXLLMHeadModel: + self.assertEqual(model.crit.cutoffs[i], copied_cutoffs[i] + resized_value) + if i < len(model.config.cutoffs): + self.assertEqual(model.config.cutoffs[i], copied_cutoffs[i] + resized_value) + + self.assertEqual(model_embed.n_token, vocab_size + resized_value) + if model_class == TransfoXLLMHeadModel: + self.assertEqual(model.crit.n_token, vocab_size + resized_value) + + def setUp(self): + self.model_tester = TransfoXLModelTester(self) + self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_transfo_xl_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs) + 
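+ # the checker below verifies the (batch, seq, hidden) shape of the hidden states and the (mem_len, batch, hidden) layout of the returned mems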
self.model_tester.check_transfo_xl_model_output(output_result) + + def test_transfo_xl_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs) + self.model_tester.check_transfo_xl_lm_head_output(output_result) + + def test_transfo_xl_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_transfo_xl_for_sequence_classification(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # transfo-xl cannot keep gradients in attentions or hidden states + return + + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): + # Opt-out of this test. + pass + + @slow + def test_model_from_pretrained(self): + for model_name in TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TransfoXLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + if not self.test_resize_embeddings: + return + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = [emb.weight.clone() for emb in model_embed.emb_layers] + # Retrieve the cutoffs and copy them + copied_cutoffs = copy.copy(model_embed.cutoffs) + + test_layers = [x for x in range(config.div_val)] + for layer in test_layers: + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10, layer) + self.assertEqual(model.config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0] + 10) + # Check that the cutoffs were modified accordingly + self.check_cutoffs_and_n_token( + copied_cutoffs, layer, model_embed, model, model_class, 10, model_vocab_size + ) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**inputs_dict) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 5, layer) + self.assertEqual(model.config.vocab_size, model_vocab_size - 5) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0] - 5) + # Check that the cutoffs were modified accordingly + self.check_cutoffs_and_n_token( + copied_cutoffs, layer, model_embed, model, model_class, -5, model_vocab_size + ) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 5 - 1) + model(**inputs_dict) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix.
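+ # every pre-existing embedding row must still compare equal element-wise, i.e. p1.data.ne(p2.data).sum() == 0 for each pair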
+ models_equal = True + for p1, p2 in zip(cloned_embeddings[layer], model_embed.emb_layers[layer].weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + # Reset model embeddings to original size + model.resize_token_embeddings(model_vocab_size, layer) + self.assertEqual(model_vocab_size, model.config.vocab_size) + self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0]) + + def test_resize_embeddings_untied(self): + # transfo-xl requires special resize for lm-head + return + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + tgt_len = min_length if idx == 0 else (min_length - 2) + src_len = (min_length + config.mem_len) if idx == 0 else (min_length + config.mem_len - 2) + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length if idx == 0 else min_length - 2 + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "cluster_weight") and module.cluster_weight is not None: + module.cluster_weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "cluster_bias") and module.cluster_bias is not None: + module.cluster_bias.data.fill_(3) + + if hasattr(module, "emb_projs"): + for i in range(len(module.emb_projs)): + if module.emb_projs[i] is not None: + nn.init.constant_(module.emb_projs[i], 0.0003) + if hasattr(module, "out_projs"): + for i in range(len(module.out_projs)): + if module.out_projs[i] is not None: + nn.init.constant_(module.out_projs[i], 0.0003) + + for param in ["r_emb", "r_w_bias", "r_r_bias", "r_bias"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + +@require_torch +class TransfoXLModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_transfo_xl_wt103(self): + model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103") + model.to(torch_device) + input_ids = torch.tensor( + [ + [ + 33, + 1297, + 2, + 1, + 1009, + 4, + 1109, + 11739, + 
4762, + 358, + 5, + 25, + 245, + 22, + 1706, + 17, + 20098, + 5, + 3215, + 21, + 37, + 1110, + 3, + 13, + 1041, + 4, + 24, + 603, + 490, + 2, + 71477, + 20098, + 104447, + 2, + 20961, + 1, + 2604, + 4, + 1, + 329, + 3, + 6224, + 831, + 16002, + 2, + 8, + 603, + 78967, + 29546, + 23, + 803, + 20, + 25, + 416, + 5, + 8, + 232, + 4, + 277, + 6, + 1855, + 4601, + 3, + 29546, + 54, + 8, + 3609, + 5, + 57211, + 49, + 4, + 1, + 277, + 18, + 8, + 1755, + 15691, + 3, + 341, + 25, + 416, + 693, + 42573, + 71, + 17, + 401, + 94, + 31, + 17919, + 2, + 29546, + 7873, + 18, + 1, + 435, + 23, + 11011, + 755, + 5, + 5167, + 3, + 7983, + 98, + 84, + 2, + 29546, + 3267, + 8, + 3609, + 4, + 1, + 4865, + 1075, + 2, + 6087, + 71, + 6, + 346, + 8, + 5854, + 3, + 29546, + 824, + 1400, + 1868, + 2, + 19, + 160, + 2, + 311, + 8, + 5496, + 2, + 20920, + 17, + 25, + 15097, + 3, + 24, + 24, + 0, + ] + ], + dtype=torch.long, + device=torch_device, + ) + # In 1991 , the remains of Russian Tsar Nicholas II and his family + # ( except for Alexei and Maria ) are discovered . + # The voice of Nicholas's young son , Tsarevich Alexei Nikolaevich , narrates the + # remainder of the story . 1883 Western Siberia , + # a young Grigori Rasputin is asked by his father and a group of men to perform magic . + # Rasputin has a vision and denounces one of the men as a horse thief . Although his + # father initially slaps him for making such an accusation , Rasputin watches as the + # man is chased outside and beaten . Twenty years later , Rasputin sees a vision of + # the Virgin Mary , prompting him to become a priest . Rasputin quickly becomes famous , + # with people , even a bishop , begging for his blessing . + + expected_output_ids = [ + 33, + 1297, + 2, + 1, + 1009, + 4, + 1109, + 11739, + 4762, + 358, + 5, + 25, + 245, + 22, + 1706, + 17, + 20098, + 5, + 3215, + 21, + 37, + 1110, + 3, + 13, + 1041, + 4, + 24, + 603, + 490, + 2, + 71477, + 20098, + 104447, + 2, + 20961, + 1, + 2604, + 4, + 1, + 329, + 3, + 6224, + 831, + 16002, + 2, + 8, + 603, + 78967, + 29546, + 23, + 803, + 20, + 25, + 416, + 5, + 8, + 232, + 4, + 277, + 6, + 1855, + 4601, + 3, + 29546, + 54, + 8, + 3609, + 5, + 57211, + 49, + 4, + 1, + 277, + 18, + 8, + 1755, + 15691, + 3, + 341, + 25, + 416, + 693, + 42573, + 71, + 17, + 401, + 94, + 31, + 17919, + 2, + 29546, + 7873, + 18, + 1, + 435, + 23, + 11011, + 755, + 5, + 5167, + 3, + 7983, + 98, + 84, + 2, + 29546, + 3267, + 8, + 3609, + 4, + 1, + 4865, + 1075, + 2, + 6087, + 71, + 6, + 346, + 8, + 5854, + 3, + 29546, + 824, + 1400, + 1868, + 2, + 19, + 160, + 2, + 311, + 8, + 5496, + 2, + 20920, + 17, + 25, + 15097, + 3, + 24, + 24, + 0, + 33, + 1, + 142, + 1298, + 188, + 2, + 29546, + 113, + 8, + 3654, + 4, + 1, + 1109, + 7136, + 833, + 3, + 13, + 1645, + 4, + 29546, + 11, + 104, + 7, + 1, + 1109, + 532, + 7129, + 2, + 10, + 83507, + 2, + 1162, + 1123, + 2, + 6, + 7245, + 10, + 2, + 5, + 11, + 104, + 7, + 1, + 1109, + 532, + 7129, + 2, + 10, + 24, + 24, + 10, + 22, + 10, + 13, + 770, + 5863, + 4, + 7245, + 10, + ] + # In 1991, the remains of Russian Tsar Nicholas II and his family ( except for + # Alexei and Maria ) are discovered. The voice of young son, Tsarevich Alexei + # Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young + # Grigori Rasputin is asked by his father and a group of men to perform magic. + # Rasputin has a vision and denounces one of the men as a horse thief. 
Although + # his father initially slaps him for making such an accusation, Rasputin watches + # as the man is chased outside and beaten. Twenty years later, Rasputin sees a + # vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly + # becomes famous, with people, even a bishop, begging for his blessing. In the + # early 20th century, Rasputin became a symbol of the Russian Orthodox Church. + # The image of Rasputin was used in the Russian national anthem, " Nearer, My God, + # to Heaven ", and was used in the Russian national anthem, " " ( " The Great Spirit + # of Heaven " + + output_ids = model.generate(input_ids, max_length=200, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/test_modeling_visual_bert.py b/test_modeling_visual_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..c4272d776be10f1ac9b9f195fe2d0466b2af9511 --- /dev/null +++ b/test_modeling_visual_bert.py @@ -0,0 +1,689 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch VisualBERT model. """ + + +import copy +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + VisualBertConfig, + VisualBertForMultipleChoice, + VisualBertForPreTraining, + VisualBertForQuestionAnswering, + VisualBertForRegionToPhraseAlignment, + VisualBertForVisualReasoning, + VisualBertModel, + ) + from transformers.models.visual_bert.modeling_visual_bert import VISUAL_BERT_PRETRAINED_MODEL_ARCHIVE_LIST + + +class VisualBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + visual_seq_length=5, + is_training=True, + use_attention_mask=True, + use_visual_attention_mask=True, + use_token_type_ids=True, + use_visual_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + visual_embedding_dim=20, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.visual_seq_length = visual_seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_visual_attention_mask = use_visual_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_visual_token_type_ids = use_visual_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + 
self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.visual_embedding_dim = visual_embedding_dim + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config(self): + return VisualBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + visual_embedding_dim=self.visual_embedding_dim, + num_labels=self.num_labels, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + visual_embeds = floats_tensor([self.batch_size, self.visual_seq_length, self.visual_embedding_dim]) + + attention_mask = None + if self.use_attention_mask: + attention_mask = torch.ones((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device) + + visual_attention_mask = None + if self.use_visual_attention_mask: + visual_attention_mask = torch.ones( + (self.batch_size, self.visual_seq_length), dtype=torch.long, device=torch_device + ) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + visual_token_type_ids = None + if self.use_visual_token_type_ids: + visual_token_type_ids = ids_tensor([self.batch_size, self.visual_seq_length], self.type_vocab_size) + + config = self.prepare_config() + return config, { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + + def prepare_config_and_inputs_for_pretraining(self): + masked_lm_labels = None + sentence_image_labels = None + + if self.use_labels: + masked_lm_labels = ids_tensor([self.batch_size, self.seq_length + self.visual_seq_length], self.vocab_size) + sentence_image_labels = ids_tensor( + [self.batch_size], + self.type_sequence_label_size, + ) + + config, input_dict = self.prepare_config_and_inputs_for_common() + + input_dict.update({"labels": masked_lm_labels, "sentence_image_labels": sentence_image_labels}) + + return config, input_dict + + def prepare_config_and_inputs_for_multiple_choice(self): + input_ids = ids_tensor([self.batch_size, self.num_choices, self.seq_length], self.vocab_size) + visual_embeds = floats_tensor( + [self.batch_size, self.num_choices, self.visual_seq_length, self.visual_embedding_dim] + ) + + attention_mask = None + if self.use_attention_mask: + attention_mask = torch.ones( + (self.batch_size, self.num_choices, self.seq_length), dtype=torch.long, device=torch_device + ) + + visual_attention_mask = None + if self.use_visual_attention_mask: + 
visual_attention_mask = torch.ones( + (self.batch_size, self.num_choices, self.visual_seq_length), dtype=torch.long, device=torch_device + ) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.num_choices, self.seq_length], self.type_vocab_size) + + visual_token_type_ids = None + if self.use_visual_token_type_ids: + visual_token_type_ids = ids_tensor( + [self.batch_size, self.num_choices, self.visual_seq_length], self.type_vocab_size + ) + + labels = None + + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.prepare_config() + return config, { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + "labels": labels, + } + + def prepare_config_and_inputs_for_vqa(self): + vqa_labels = None + + if self.use_labels: + vqa_labels = floats_tensor([self.batch_size, self.num_labels]) + + config, input_dict = self.prepare_config_and_inputs_for_common() + + input_dict.update({"labels": vqa_labels}) + return config, input_dict + + def prepare_config_and_inputs_for_nlvr(self): + nlvr_labels = None + + if self.use_labels: + nlvr_labels = ids_tensor([self.batch_size], self.num_labels) + + config, input_dict = self.prepare_config_and_inputs_for_common() + + input_dict.update({"labels": nlvr_labels}) + return config, input_dict + + def prepare_config_and_inputs_for_flickr(self): + region_to_phrase_position = torch.cat( + ( + ids_tensor([self.batch_size, self.seq_length], self.visual_seq_length), + torch.ones(self.batch_size, self.visual_seq_length, dtype=torch.long, device=torch_device) * -1, + ), + dim=-1, + ) + flickr_labels = None + if self.use_labels: + flickr_labels = floats_tensor( + [self.batch_size, self.seq_length + self.visual_seq_length, self.visual_seq_length] + ) + + config, input_dict = self.prepare_config_and_inputs_for_common() + + input_dict.update({"region_to_phrase_position": region_to_phrase_position, "labels": flickr_labels}) + return config, input_dict + + def create_and_check_model(self, config, input_dict): + model = VisualBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(**input_dict) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.seq_length + self.visual_seq_length, self.hidden_size), + ) + + def create_and_check_for_pretraining(self, config, input_dict): + model = VisualBertForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(**input_dict) + self.parent.assertEqual( + result.prediction_logits.shape, + (self.batch_size, self.seq_length + self.visual_seq_length, self.vocab_size), + ) + + def create_and_check_for_vqa(self, config, input_dict): + model = VisualBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model(**input_dict) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice(self, config, input_dict): + model = VisualBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + result = model(**input_dict) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_nlvr(self, config, input_dict): + model = VisualBertForVisualReasoning(config=config) + model.to(torch_device) + model.eval() + result = 
model(**input_dict) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_flickr(self, config, input_dict): + model = VisualBertForRegionToPhraseAlignment(config=config) + model.to(torch_device) + model.eval() + result = model(**input_dict) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.seq_length + self.visual_seq_length, self.visual_seq_length) + ) + + +@require_torch +class VisualBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + VisualBertModel, + VisualBertForMultipleChoice, + VisualBertForVisualReasoning, + VisualBertForRegionToPhraseAlignment, + VisualBertForQuestionAnswering, + VisualBertForPreTraining, + ) + if is_torch_available() + else () + ) + test_torchscript = False + test_pruning = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + if model_class == VisualBertForMultipleChoice: + for key in inputs_dict.keys(): + value = inputs_dict[key] + if isinstance(value, torch.Tensor) and value.ndim > 1: + if key != "visual_embeds": + inputs_dict[key] = ( + inputs_dict[key].unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() + ) + else: + inputs_dict[key] = ( + inputs_dict[key] + .unsqueeze(1) + .expand(-1, self.model_tester.num_choices, -1, self.model_tester.visual_embedding_dim) + .contiguous() + ) + + elif model_class == VisualBertForRegionToPhraseAlignment: + total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length + batch_size = self.model_tester.batch_size + inputs_dict["region_to_phrase_position"] = torch.zeros( + (batch_size, total_length), + dtype=torch.long, + device=torch_device, + ) + + if return_labels: + if model_class == VisualBertForMultipleChoice: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + elif model_class == VisualBertForPreTraining: + total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length + batch_size = self.model_tester.batch_size + inputs_dict["labels"] = torch.zeros( + (batch_size, total_length), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["sentence_image_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + # Flickr expects float labels + elif model_class == VisualBertForRegionToPhraseAlignment: + batch_size = self.model_tester.batch_size + total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length + + inputs_dict["labels"] = torch.ones( + ( + batch_size, + total_length, + self.model_tester.visual_seq_length, + ), + dtype=torch.float, + device=torch_device, + ) + + # VQA expects float labels + elif model_class == VisualBertForQuestionAnswering: + inputs_dict["labels"] = torch.ones( + (self.model_tester.batch_size, self.model_tester.num_labels), + dtype=torch.float, + device=torch_device, + ) + + elif model_class == VisualBertForVisualReasoning: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size), dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def setUp(self): + self.model_tester = VisualBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=VisualBertConfig, hidden_size=37) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) 
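+ # VisualBERT concatenates the text and visual inputs into a single sequence, so the attention shapes checked below use seq_length + visual_seq_length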
+ visual_seq_len = getattr(self.model_tester, "visual_seq_length", None) + + encoder_seq_length = (seq_len if seq_len is not None else 0) + ( + visual_seq_len if visual_seq_len is not None else 0 + ) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if 
hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + self.model_tester.visual_seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_model_for_vqa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_vqa() + self.model_tester.create_and_check_for_vqa(*config_and_inputs) + + def test_model_for_nlvr(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_nlvr() + self.model_tester.create_and_check_for_nlvr(*config_and_inputs) + + def test_model_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_multiple_choice() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_model_for_flickr(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_flickr() + self.model_tester.create_and_check_for_flickr(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in VISUAL_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = VisualBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class VisualBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_vqa_coco_pre(self): + model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre") + + input_ids = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.long).reshape(1, -1) + token_type_ids = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.long).reshape(1, -1) + visual_embeds = torch.ones(size=(1, 10, 2048), dtype=torch.float32) * 0.5 + visual_token_type_ids = torch.ones(size=(1, 10), dtype=torch.long) + attention_mask = torch.tensor([1] * 6).reshape(1, -1) + visual_attention_mask = torch.tensor([1] * 10).reshape(1, -1) + + output = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + ) + + vocab_size = 30522 + + expected_shape = torch.Size((1, 16, vocab_size)) + 
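# sequence length is 16: 6 text tokens plus 10 visual embeddings +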
self.assertEqual(output.prediction_logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[[-5.1858, -5.1903, -4.9142], [-6.2214, -5.9238, -5.8381], [-6.3027, -5.9939, -5.9297]]] + ) + + self.assertTrue(torch.allclose(output.prediction_logits[:, :3, :3], expected_slice, atol=1e-4)) + + expected_shape_2 = torch.Size((1, 2)) + self.assertEqual(output.seq_relationship_logits.shape, expected_shape_2) + + expected_slice_2 = torch.tensor([[0.7393, 0.1754]]) + + self.assertTrue(torch.allclose(output.seq_relationship_logits, expected_slice_2, atol=1e-4)) + + @slow + def test_inference_vqa(self): + model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa") + + input_ids = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.long).reshape(1, -1) + token_type_ids = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.long).reshape(1, -1) + visual_embeds = torch.ones(size=(1, 10, 2048), dtype=torch.float32) * 0.5 + visual_token_type_ids = torch.ones(size=(1, 10), dtype=torch.long) + attention_mask = torch.tensor([1] * 6).reshape(1, -1) + visual_attention_mask = torch.tensor([1] * 10).reshape(1, -1) + + output = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + ) + + # vocab_size = 30522 + + expected_shape = torch.Size((1, 3129)) + self.assertEqual(output.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[-8.9898, 3.0803, -1.8016, 2.4542, -8.3420, -2.0224, -3.3124, -4.4139, -3.1491, -3.8997]] + ) + + self.assertTrue(torch.allclose(output.logits[:, :10], expected_slice, atol=1e-4)) + + @slow + def test_inference_nlvr(self): + model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2") + + input_ids = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.long).reshape(1, -1) + token_type_ids = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.long).reshape(1, -1) + visual_embeds = torch.ones(size=(1, 10, 1024), dtype=torch.float32) * 0.5 + visual_token_type_ids = torch.ones(size=(1, 10), dtype=torch.long) + attention_mask = torch.tensor([1] * 6).reshape(1, -1) + visual_attention_mask = torch.tensor([1] * 10).reshape(1, -1) + + output = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + ) + + # vocab_size = 30522 + + expected_shape = torch.Size((1, 2)) + self.assertEqual(output.logits.shape, expected_shape) + + expected_slice = torch.tensor([[-1.1436, 0.8900]]) + + self.assertTrue(torch.allclose(output.logits, expected_slice, atol=1e-4)) + + @slow + def test_inference_vcr(self): + model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr") + + input_ids = torch.tensor([[[1, 2, 3, 4, 5, 6] for i in range(4)]], dtype=torch.long) + attention_mask = torch.ones_like(input_ids) + token_type_ids = torch.ones_like(input_ids) + + visual_embeds = torch.ones(size=(1, 4, 10, 512), dtype=torch.float32) * 0.5 + visual_token_type_ids = torch.ones(size=(1, 4, 10), dtype=torch.long) + visual_attention_mask = torch.ones_like(visual_token_type_ids) + + output = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + ) + + # vocab_size = 30522 + + expected_shape 
= torch.Size((1, 4)) + self.assertEqual(output.logits.shape, expected_shape) + + expected_slice = torch.tensor([[-7.7697, -7.7697, -7.7697, -7.7697]]) + + self.assertTrue(torch.allclose(output.logits, expected_slice, atol=1e-4)) diff --git a/test_modeling_vit.py b/test_modeling_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..b45c12c16d3d3dfb0942166ff8fecf820665837c --- /dev/null +++ b/test_modeling_vit.py @@ -0,0 +1,353 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch ViT model. """ + + +import inspect +import unittest + +from transformers.file_utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ViTConfig, ViTForImageClassification, ViTModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class ViTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = ViTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, 
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, pixel_values, labels + + def create_and_check_model(self, config, pixel_values, labels): + model = ViTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = to_2tuple(self.image_size) + patch_size = to_2tuple(self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = ViTForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ViTModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + ViTModel, + ViTForImageClassification, + ) + if is_torch_available() + else () + ) + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ViTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # ViT does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + encoder_seq_length = 
getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # ViT has a different seq_length + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = 
(image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = ViTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_vision +class ViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/test_modeling_wav2vec2.py b/test_modeling_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..f9fa91a47682d20fca95ca4cfdcf63c1285dba0f --- /dev/null +++ b/test_modeling_wav2vec2.py @@ -0,0 +1,839 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Wav2Vec2 model. 
""" + + +import math +import unittest + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import is_torch_available +from transformers.testing_utils import require_datasets, require_soundfile, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, _config_zero_init + + +if is_torch_available(): + import torch + + from transformers import ( + Wav2Vec2Config, + Wav2Vec2FeatureExtractor, + Wav2Vec2ForCTC, + Wav2Vec2ForMaskedLM, + Wav2Vec2ForPreTraining, + Wav2Vec2Model, + Wav2Vec2Processor, + ) + from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2GumbelVectorQuantizer, _compute_mask_indices + + +class Wav2Vec2ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=4, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + vocab_size=32, + do_stable_layer_norm=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.scope = scope + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = Wav2Vec2Config( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + 
layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + ) + + return config, input_values, attention_mask + + def create_and_check_model(self, config, input_values, attention_mask): + model = Wav2Vec2Model(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = Wav2Vec2Model(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = Wav2Vec2ForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + self.parent.assertTrue(abs(labels.shape[0] * labels.shape[1] * mean_loss.item() - sum_loss.item()) < 1e-3) + + def check_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Wav2Vec2ForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_extractor() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important to make sure that target lengths are at least + # one shorter than logit lengths to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + 
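# also run a backward pass to check that the loss is differentiable end-to-end +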
loss.backward() + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForPreTraining) if is_torch_available() else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Wav2Vec2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + # Wav2Vec2 has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Wav2Vec2 cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Wav2Vec2 has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 
1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + @slow + def test_model_from_pretrained(self): + model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsNotNone(model) + + +@require_torch +class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForPreTraining) if is_torch_available() else () + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Wav2Vec2ModelTester( + self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True + ) + self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + # Wav2Vec2 has no inputs_embeds + def test_inputs_embeds(self): + pass + + # `input_ids` is renamed to `input_values` + def test_forward_signature(self): + pass + + # Wav2Vec2 cannot resize token embeddings + # since it has no tokens embeddings + def test_resize_tokens_embeddings(self): + pass + + # Wav2Vec2 has no inputs_embeds + # and thus the `get_input_embeddings` fn + # is not implemented + def test_model_common_attributes(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = 
ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + ] + if param.requires_grad: + if any([x in name for x in uniform_init_parms]): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_model_for_pretraining(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = Wav2Vec2ForPreTraining(config).to(torch_device) + + features_shape = ( + inputs_dict["input_values"].shape[0], + model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])), + ) + + mask_time_indices = _compute_mask_indices( + features_shape, + model.config.mask_time_prob, + model.config.mask_time_length, + device=inputs_dict["input_values"].device, + min_masks=2, + ).to(torch_device) + + loss = model( + inputs_dict["input_values"], + attention_mask=inputs_dict["attention_mask"], + mask_time_indices=mask_time_indices, + ).loss + + mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True + loss_more_masked = model( + inputs_dict["input_values"], + attention_mask=inputs_dict["attention_mask"], + mask_time_indices=mask_time_indices, + ).loss + + # loss_more_masked has to be bigger or equal loss since more masked inputs have to be predicted + self.assertTrue(loss.detach().item() <= loss_more_masked.detach().item()) + + @slow + def test_model_from_pretrained(self): + model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") + self.assertIsNotNone(model) + + +@require_torch +class Wav2Vec2UtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = 
_compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length, torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length, torch_device) + + # because of overlap, the per-batch mask sums don't have to add up exactly to `mask_prob * sequence_length`, but they have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_perplexity(self): + probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) + self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) + + # mask half of the input + mask = torch.ones((2,), device=torch_device, dtype=torch.bool) + mask[0] = 0 + + ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) + self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) + + def test_sample_negatives(self): + batch_size = 2 + sequence_length = 10 + hidden_size = 4 + num_negatives = 3 + + features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( + sequence_length, hidden_size + ) # each vector consists of a single value, repeated hidden_size times + features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() + + negatives = Wav2Vec2ForPreTraining._sample_negatives(features, num_negatives) + + self.assertEqual(negatives.shape, (num_negatives, batch_size, sequence_length, hidden_size)) + + # make sure no negatively sampled vector is actually a positive one + for negative in negatives: + self.assertTrue(((negative - features) == 0).sum() == 0.0) + + # make sure that full vectors are sampled and not single values of vectors => this means that `unique()` yields a single value for the `hidden_size` dim + self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) + + +@require_torch +@require_datasets +@require_soundfile +@slow +class Wav2Vec2ModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + import soundfile as sf + + ids = [f"1272-141231-000{i}" for i in range(num_samples)] + + # map files to raw + def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + + ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) + + return ds["speech"][:num_samples] + + def test_inference_ctc_normal(self): + model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + model.to(torch_device)
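+ # Wav2Vec2Processor bundles the feature extractor (raw audio -> input_values) and the CTC tokenizer used by batch_decode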
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True) + + input_values = inputs.input_values.to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", + "his instant panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_integration(self): + model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + model.to(torch_device) + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "facebook/wav2vec2-base", return_attention_mask=True + ) + input_speech = self._load_datasamples(2) + + inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True) + + features_shape = ( + inputs_dict["input_values"].shape[0], + model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])), + ) + + torch.manual_seed(0) + mask_time_indices = _compute_mask_indices( + features_shape, + model.config.mask_time_prob, + model.config.mask_time_length, + device=inputs_dict["input_values"].device, + min_masks=2, + ).to(torch_device) + + with torch.no_grad(): + outputs = model( + inputs_dict.input_values.to(torch_device), + attention_mask=inputs_dict.attention_mask.to(torch_device), + mask_time_indices=mask_time_indices, + ) + + # compute cosine similarity + cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + # retrieve cosine sim of masked features + cosine_sim_masked = cosine_sim[mask_time_indices] + + # fmt: off + expected_cosine_sim_masked = torch.tensor( + [0.7458, 0.7188, 0.6418, 0.3729, 0.3741, 0.3694, 0.3110, 0.2257, 0.4403, 0.5415, 0.3950, 0.3701, 0.8831, 0.8613, 0.5229, 0.6696, 0.7206, 0.7877, 0.6758, 0.8746, 0.6596, 0.6282, 0.6178, 0.5839, 0.5926, 0.6651, 0.4635, 0.6332, 0.6572, 0.8776, 0.4999, 0.7001, 0.7257, 0.5098, 0.6229, 0.4566, 0.5261, 0.6363, 0.5371, 0.6997], + 
device=torch_device, + ) + # fmt: on + + self.assertTrue(torch.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) + + def test_inference_pretrained(self): + model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") + model.to(torch_device) + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "facebook/wav2vec2-base", return_attention_mask=True + ) + input_speech = self._load_datasamples(2) + + inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True) + + features_shape = ( + inputs_dict["input_values"].shape[0], + model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])), + ) + + torch.manual_seed(0) + mask_time_indices = _compute_mask_indices( + features_shape, + model.config.mask_time_prob, + model.config.mask_time_length, + device=inputs_dict["input_values"].device, + min_masks=2, + ).to(torch_device) + + with torch.no_grad(): + outputs = model( + inputs_dict.input_values.to(torch_device), + attention_mask=inputs_dict.attention_mask.to(torch_device), + mask_time_indices=mask_time_indices, + ) + + # compute cosine similarity + cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + # retrieve cosine sim of masked features + cosine_sim_masked = cosine_sim[mask_time_indices] + + # ... now compare to randomly initialized model + + config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base") + model_rand = Wav2Vec2ForPreTraining(config).to(torch_device).eval() + + with torch.no_grad(): + outputs_rand = model_rand( + inputs_dict.input_values.to(torch_device), + attention_mask=inputs_dict.attention_mask.to(torch_device), + mask_time_indices=mask_time_indices, + ) + + # compute cosine similarity + cosine_sim_rand = torch.cosine_similarity( + outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1 + ) + + # retrieve cosine sim of masked features + cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices] + + # a pretrained wav2vec2 model has learned to predict the quantized latent states + # => the cosine similarity between quantized states and predicted states > 0.5 + # a random wav2vec2 model has not learned to predict the quantized latent states + # => the cosine similarity between quantized states and predicted states is very likely < 0.1 + self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0) + + @unittest.skipIf(torch_device != "cpu", "cannot make deterministic on GPU") + def test_loss_pretraining(self): + model = Wav2Vec2ForPreTraining.from_pretrained( + "facebook/wav2vec2-base", + attention_dropout=0.0, + feat_proj_dropout=0.0, + hidden_dropout=0.0, + layerdrop=0.0, + ) + model.to(torch_device).train() + + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "facebook/wav2vec2-base", return_attention_mask=True + ) + input_speech = self._load_datasamples(2) + + inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True) + + features_shape = ( + inputs_dict["input_values"].shape[0], + model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]), + ) + + torch.manual_seed(0) + mask_time_indices = _compute_mask_indices( + features_shape, + model.config.mask_time_prob, + model.config.mask_time_length, + device=inputs_dict["input_values"].device, + min_masks=2, + ).to(torch_device) + + with torch.no_grad(): + outputs = model( + inputs_dict.input_values.to(torch_device), + 
attention_mask=inputs_dict.attention_mask.to(torch_device), + mask_time_indices=mask_time_indices, + ) + + # check diversity loss + num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups + diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors + self.assertTrue(abs(diversity_loss.item() - 0.8859) < 1e-3) + + # check overall loss (contrastive loss + diversity loss) + expected_loss = 62.5170 + + self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) diff --git a/test_modeling_xlm.py b/test_modeling_xlm.py new file mode 100644 index 0000000000000000000000000000000000000000..691a4039ea93c28af0a8434e755c51d0bc4e95c8 --- /dev/null +++ b/test_modeling_xlm.py @@ -0,0 +1,488 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + XLMConfig, + XLMForMultipleChoice, + XLMForQuestionAnswering, + XLMForQuestionAnsweringSimple, + XLMForSequenceClassification, + XLMForTokenClassification, + XLMModel, + XLMWithLMHeadModel, + ) + from transformers.models.xlm.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST + + +class XLMModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_lengths = True + self.use_token_type_ids = True + self.use_labels = True + self.gelu_activation = True + self.sinusoidal_embeddings = False + self.causal = False + self.asm = False + self.n_langs = 2 + self.vocab_size = 99 + self.n_special = 0 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 2 + self.num_choices = 4 + self.summary_type = "last" + self.use_proj = True + self.scope = None + self.bos_token_id = 0 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = 
ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = XLMConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + num_labels=self.num_labels, + bos_token_id=self.bos_token_id, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = XLMModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, lengths=input_lengths, langs=token_type_ids) + result = model(input_ids, langs=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = XLMWithLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_xlm_simple_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = XLMForQuestionAnsweringSimple(config) + model.to(torch_device) + model.eval() + + outputs = model(input_ids) + + outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) + result = outputs + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = XLMForQuestionAnswering(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) + + (total_loss,) = result_with_labels.to_tuple() + + result_with_labels = model(input_ids, 
start_positions=sequence_labels, end_positions=sequence_labels) + + (total_loss,) = result_with_labels.to_tuple() + + self.parent.assertEqual(result_with_labels.loss.shape, ()) + self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual( + result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual( + result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) + + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = XLMForSequenceClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + result = model(input_ids, labels=sequence_labels) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_xlm_token_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_labels = self.num_labels + model = XLMForTokenClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_xlm_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = XLMForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths} + return config, inputs_dict + + +@require_torch +class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + XLMModel, + XLMWithLMHeadModel, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMForQuestionAnsweringSimple, + XLMForTokenClassification, + XLMForMultipleChoice, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = ( + (XLMWithLMHeadModel,) if is_torch_available() else () + ) # TODO (PVP): Check other models whether 
language generation is also applicable + test_sequence_classification_problem_types = True + + # XLM has 2 QA models -> need to manually set the correct labels for one of them here + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "XLMForQuestionAnswering": + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def setUp(self): + self.model_tester = XLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) + + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) + + def test_xlm_simple_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs) + + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) + + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) + + def test_xlm_token_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs) + + def test_xlm_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, iter_attentions in enumerate(attentions): + # adds PAD dummy token + tgt_len = min_length + idx + 1 + src_len = min_length + idx + 1 + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + # adds PAD dummy token + seq_len = min_length + idx + 1 + expected_shape = (batch_size * num_beam_groups, seq_len, 
config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + pass + + @slow + def test_model_from_pretrained(self): + for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = XLMModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class XLMModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_xlm_mlm_en_2048(self): + model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048") + model.to(torch_device) + input_ids = torch.tensor([[14, 447]], dtype=torch.long, device=torch_device) # the president + expected_output_ids = [ + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + 14, + 447, + ] # the president the president the president the president the president the president the president the president the president the president + # TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].cpu().numpy().tolist(), expected_output_ids) diff --git a/test_modeling_xlm_prophetnet.py b/test_modeling_xlm_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..51e8502b9bd5ac9016cac41b304aa4b08a2dc473 --- /dev/null +++ b/test_modeling_xlm_prophetnet.py @@ -0,0 +1,142 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
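+# Background for the stream checks below (an explanatory sketch): ProphetNet
+# decoders predict `ngram` future tokens per position through separate streams,
+# and only the first (main) stream feeds the LM head. With a decoder input of
+# length 14, the second decoder output can thus be viewed as
+# (batch, ngram, 14, hidden), and the first stream's logits should reproduce
+# the model's output logits, roughly:
+#
+#     streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
+#     main_stream_logits = model.lm_head(streams)[:, 0]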
+
+
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import require_torch, slow, torch_device
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer
+
+
+@require_torch
+class XLMProphetNetModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_pretrained_checkpoint_hidden_states(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+        model.to(torch_device)
+
+        # encoder-decoder outputs
+        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
+        decoder_prev_ids = torch.tensor(
+            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
+        ).to(torch_device)
+        output = model(
+            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
+        )
+        output_predicted_logits = output[0]
+        expected_shape = torch.Size((1, 14, 250012))
+        self.assertEqual(output_predicted_logits.shape, expected_shape)
+        expected_slice = torch.tensor(
+            [[[-6.6042, -8.3838, 12.4717], [-6.4426, -8.1994, 12.4542], [-6.0851, -7.8209, 12.9493]]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
+
+        # encoder outputs
+        encoder_outputs = model.prophetnet.encoder(encoder_ids)[0]
+        expected_encoder_outputs_slice = torch.tensor(
+            [[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]]
+        ).to(torch_device)
+        expected_shape_encoder = torch.Size((1, 4, 1024))
+        self.assertEqual(encoder_outputs.shape, expected_shape_encoder)
+        self.assertTrue(torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4))
+
+        # decoder outputs
+        decoder_outputs = model.prophetnet.decoder(
+            decoder_prev_ids,
+            encoder_hidden_states=encoder_outputs,
+        )
+        predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1)
+        predicting_streams_logits = model.lm_head(predicting_streams)
+        next_first_stream_logits = predicting_streams_logits[:, 0]
+        self.assertTrue(torch.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_ntg_hidden_states(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained(
+            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
+        )
+        model.to(torch_device)
+
+        encoder_ids = torch.tensor([[17, 96208, 103471, 2]]).to(torch_device)
+        decoder_prev_ids = torch.tensor(
+            [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]]
+        ).to(torch_device)
+        output = model(
+            input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids
+        )
+        output_predicted_logits = output[0]
+        expected_shape = torch.Size((1, 14, 250012))
+        self.assertEqual(output_predicted_logits.shape, expected_shape)
+        # compare the actual values for a slice.
+        expected_slice = torch.tensor(
+            [[[-8.8815, -9.2996, -4.4506], [-6.7202, -7.8944, -0.9402], [-8.6890, -7.4528, -1.9437]]]
+        ).to(torch_device)
+
+        self.assertTrue(torch.allclose(output_predicted_logits[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_xprophetnet_ntg_inference(self):
+        model = XLMProphetNetForConditionalGeneration.from_pretrained(
+            "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
+        )
+        model.to(torch_device)
+        model.config.max_length = 512
+
+        tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
+
+        EN_SENTENCE = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020, according to the official portal of the organization. From that day, users of this system will not be able to receive security updates, which could make their computers vulnerable to cyber attacks."
+        RU_SENTENCE = "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7 после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми к кибератакам."
+        ZH_SENTENCE = (
+            "根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。"
+        )
+
+        input_ids = tokenizer(
+            [EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="pt"
+        ).input_ids
+        input_ids = input_ids.to(torch_device)
+
+        summary_ids = model.generate(
+            input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
+        EXPECTED_TITLE_EN = "Microsoft to end Windows 7 free support after January 14, 2020"
+        EXPECTED_TITLE_RU = "Microsoft намерена прекратить бесплатную поддержку Windows 7 после 14 января 2020 года"
+        EXPECTED_TITLE_ZH = "微软打算终止对Windows 7操作系统的免费支持"
+        self.assertListEqual(
+            [EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH],
+            generated_titles,
+        )
+
+        summary_ids_beam1 = model.generate(
+            input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
+        )
+        generated_titles_beam1_tok = [
+            tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
+        ]
+        EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
+        EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
+            " "
+        )
+        EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
+        self.assertListEqual(
+            [EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK],
+            generated_titles_beam1_tok,
+        )
diff --git a/test_modeling_xlm_roberta.py b/test_modeling_xlm_roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..35ce2bd88185d6e752791bf74cbb4a4d39304ead
--- /dev/null
+++ b/test_modeling_xlm_roberta.py
@@ -0,0 +1,69 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow + + +if is_torch_available(): + import torch + + from transformers import XLMRobertaModel + + +@require_sentencepiece +@require_tokenizers +@require_torch +class XLMRobertaModelIntegrationTest(unittest.TestCase): + @slow + def test_xlm_roberta_base(self): + model = XLMRobertaModel.from_pretrained("xlm-roberta-base") + input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) + # The dog is cute and lives in the garden house + + expected_output_shape = torch.Size((1, 12, 768)) # batch_size, sequence_length, embedding_vector_dim + expected_output_values_last_dim = torch.tensor( + [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]] + ) + # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base') + # xlmr.eval() + # expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1] + + output = model(input_ids)["last_hidden_state"].detach() + self.assertEqual(output.shape, expected_output_shape) + # compare the actual values for a slice of last dim + self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3)) + + @slow + def test_xlm_roberta_large(self): + model = XLMRobertaModel.from_pretrained("xlm-roberta-large") + input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) + # The dog is cute and lives in the garden house + + expected_output_shape = torch.Size((1, 12, 1024)) # batch_size, sequence_length, embedding_vector_dim + expected_output_values_last_dim = torch.tensor( + [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]] + ) + # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large') + # xlmr.eval() + # expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1] + + output = model(input_ids)["last_hidden_state"].detach() + self.assertEqual(output.shape, expected_output_shape) + # compare the actual values for a slice of last dim + self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3)) diff --git a/test_modeling_xlnet.py b/test_modeling_xlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2ab4940689ece9c62a327ed27f84333b4bce817f --- /dev/null +++ b/test_modeling_xlnet.py @@ -0,0 +1,1070 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
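+# Background for the `use_mems` checks below: XLNet can cache the hidden
+# states of earlier forward passes as `mems` and attend to them in later
+# passes. The equivalence being tested is, roughly (the actual test also
+# builds an appropriate perm_mask for each pass):
+#
+#     mems = model(ids, use_mems=True).mems
+#     cached = model(next_token, mems=mems)["last_hidden_state"][:, 0]
+#     full = model(torch.cat([ids, next_token], dim=-1))["last_hidden_state"][:, -1]
+#     # cached and full should agree to within ~1e-3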
+ + +import random +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + XLNetConfig, + XLNetForMultipleChoice, + XLNetForQuestionAnswering, + XLNetForQuestionAnsweringSimple, + XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetLMHeadModel, + XLNetModel, + ) + from transformers.models.xlnet.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_LIST + + +class XLNetModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + bos_token_id=1, + eos_token_id=2, + pad_token_id=5, + num_choices=4, + ): + self.parent = parent + self.batch_size = 14 + self.seq_length = 7 + self.mem_len = 10 + # self.key_len = seq_length + mem_len + self.clamp_len = -1 + self.reuse_len = 15 + self.is_training = True + self.use_labels = True + self.vocab_size = 99 + self.cutoffs = [10, 50, 80] + self.hidden_size = 32 + self.num_attention_heads = 4 + self.d_inner = 128 + self.num_hidden_layers = 5 + self.type_sequence_label_size = 2 + self.untie_r = True + self.bi_data = False + self.same_length = False + self.initializer_range = 0.05 + self.seed = 1 + self.type_vocab_size = 2 + self.bos_token_id = 1 + self.eos_token_id = 2 + self.pad_token_id = 5 + self.num_choices = 4 + + def prepare_config_and_inputs(self): + input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) + perm_mask = torch.zeros( + self.batch_size, + self.seq_length + 1, + self.seq_length + 1, + dtype=torch.float, + device=torch_device, + ) + perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = torch.zeros( + self.batch_size, + 1, + self.seq_length + 1, + dtype=torch.float, + device=torch_device, + ) + target_mapping[:, 0, -1] = 1.0 # predict last token + + sequence_labels = None + lm_labels = None + is_impossible_labels = None + token_labels = None + if self.use_labels: + lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = XLNetConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + n_head=self.num_attention_heads, + d_inner=self.d_inner, + n_layer=self.num_hidden_layers, + untie_r=self.untie_r, + mem_len=self.mem_len, + clamp_len=self.clamp_len, + same_length=self.same_length, + reuse_len=self.reuse_len, + bi_data=self.bi_data, + 
initializer_range=self.initializer_range,
+            num_labels=self.type_sequence_label_size,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            eos_token_id=self.eos_token_id,
+        )
+
+        return (
+            config,
+            input_ids_1,
+            input_ids_2,
+            input_ids_q,
+            perm_mask,
+            input_mask,
+            target_mapping,
+            segment_ids,
+            lm_labels,
+            sequence_labels,
+            is_impossible_labels,
+            token_labels,
+        )
+
+    def set_seed(self):
+        random.seed(self.seed)
+        torch.manual_seed(self.seed)
+
+    def create_and_check_xlnet_base_model(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetModel(config)
+        model.to(torch_device)
+        model.eval()
+
+        result = model(input_ids_1, input_mask=input_mask)
+        result = model(input_ids_1, attention_mask=input_mask)
+        result = model(input_ids_1, token_type_ids=segment_ids)
+        result = model(input_ids_1)
+
+        config.mem_len = 0
+        model = XLNetModel(config)
+        model.to(torch_device)
+        model.eval()
+        base_model_output = model(input_ids_1)
+        self.parent.assertEqual(len(base_model_output), 2)
+
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+        self.parent.assertListEqual(
+            [mem.shape for mem in result.mems],
+            [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
+        )
+
+    def create_and_check_use_mems_train(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetForSequenceClassification(config)
+        model.to(torch_device)
+        model.train()
+
+        train_size = input_ids_1.shape[0]
+
+        batch_size = 4
+        for i in range(train_size // batch_size + 1):
+            input_ids = input_ids_1[i * batch_size : (i + 1) * batch_size]
+            labels = sequence_labels[i * batch_size : (i + 1) * batch_size]
+            outputs = model(input_ids=input_ids, labels=labels, return_dict=True)
+            self.parent.assertIsNone(outputs.mems)
+            self.parent.assertIsNotNone(outputs.loss)
+
+    def create_and_check_xlnet_model_use_mems(
+        self,
+        config,
+        input_ids_1,
+        input_ids_2,
+        input_ids_q,
+        perm_mask,
+        input_mask,
+        target_mapping,
+        segment_ids,
+        lm_labels,
+        sequence_labels,
+        is_impossible_labels,
+        token_labels,
+    ):
+        model = XLNetModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        causal_mask = torch.ones(
+            input_ids_1.shape[0],
+            input_ids_1.shape[1],
+            input_ids_1.shape[1],
+            dtype=torch.float,
+            device=torch_device,
+        )
+        causal_mask = torch.triu(causal_mask, diagonal=0)
+        outputs_cache = model(input_ids_1, use_mems=True, perm_mask=causal_mask)
+        outputs_no_cache = model(input_ids_1, use_mems=False, perm_mask=causal_mask)
+        outputs_conf = model(input_ids_1)
+
+        self.parent.assertTrue(len(outputs_cache) == len(outputs_conf))
+        self.parent.assertTrue(len(outputs_cache) == len(outputs_no_cache) + 1)
+
+        output, mems = outputs_cache.to_tuple()
+
+        # create hypothetical next token and extend to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+        # append to next input_ids and token_type_ids
+        next_input_ids = torch.cat([input_ids_1, next_tokens], dim=-1)
+
+        # causal mask
+        causal_mask = torch.ones(
+            input_ids_1.shape[0],
+            input_ids_1.shape[1] + 1,
+            input_ids_1.shape[1] + 1,
+            dtype=torch.float,
+            device=torch_device,
+        )
+        causal_mask = torch.triu(causal_mask, diagonal=0)
+        single_mask
= torch.ones(input_ids_1.shape[0], 1, 1, dtype=torch.float, device=torch_device) + + # second forward pass + output_from_no_past = model(next_input_ids, perm_mask=causal_mask)["last_hidden_state"] + output_from_past = model(next_tokens, mems=mems, perm_mask=single_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_xlnet_base_model_with_att_output( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): + model = XLNetModel(config) + model.to(torch_device) + model.eval() + + attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)["attentions"] + + self.parent.assertEqual(len(attentions), config.n_layer) + self.parent.assertIsInstance(attentions[0], tuple) + self.parent.assertEqual(len(attentions[0]), 2) + self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape) + + def create_and_check_xlnet_lm_head( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): + model = XLNetLMHeadModel(config) + model.to(torch_device) + model.eval() + + result1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) + + result2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=result1.mems) + + _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) + + self.parent.assertEqual(result1.loss.shape, ()) + self.parent.assertEqual(result1.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in result1.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + self.parent.assertEqual(result2.loss.shape, ()) + self.parent.assertEqual(result2.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertListEqual( + [mem.shape for mem in result2.mems], + [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): + model = XLNetForQuestionAnswering(config) + model.to(torch_device) + model.eval() + + result = model(input_ids_1) + + result_with_labels = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + result_with_labels = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) + + total_loss, mems = result_with_labels.to_tuple() + + result_with_labels = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + + total_loss, mems = result_with_labels.to_tuple() + + 
self.parent.assertEqual(result_with_labels.loss.shape, ()) + self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual( + result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual( + result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_token_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): + model = XLNetForTokenClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids_1) + result = model(input_ids_1, labels=token_labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.type_sequence_label_size)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): + model = XLNetForSequenceClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids_1) + result = model(input_ids_1, labels=sequence_labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + self.parent.assertListEqual( + [mem.shape for mem in result.mems], + [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} + return config, inputs_dict + + +@require_torch +class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + XLNetModel, + XLNetLMHeadModel, + XLNetForTokenClassification, + XLNetForSequenceClassification, + XLNetForQuestionAnswering, + XLNetForQuestionAnsweringSimple, + XLNetForMultipleChoice, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = ( + (XLNetLMHeadModel,) if is_torch_available() else () + ) # TODO (PVP): Check other models whether language generation is also applicable + test_pruning = False + test_sequence_classification_problem_types = True + + # XLNet has 2 QA models -> need to manually set the correct labels for one of them here + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == 
"XLNetForQuestionAnswering": + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def setUp(self): + self.model_tester = XLNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlnet_base_model(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) + + def test_xlnet_base_model_use_mems(self): + # checking that in auto-regressive mode, :obj:`use_mems` gives the same results + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_model_use_mems(*config_and_inputs) + + def test_seq_classification_use_mems_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_use_mems_train(*config_and_inputs) + + def test_xlnet_base_model_with_att_output(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs) + + def test_xlnet_lm_head(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) + + def test_xlnet_sequence_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) + + def test_xlnet_token_classif(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs) + + def test_xlnet_qa(self): + self.model_tester.set_seed() + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) + + def test_retain_grad_hidden_states_attentions(self): + # xlnet cannot keep gradients in attentions or hidden states + return + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + for param in ["q", "k", "v", "o", "r", "r_r_bias", "r_s_bias", "r_w_bias", "seg_embed", "mask_emb"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + # check hidden size + for i, layer_hidden_states in enumerate(iter_hidden_states): + # every 2nd tensor is from extra stream + if i % 2 != 0: + seq_len = 1 
+ else: + # for first item dummy PAD token is appended so need one more + seq_len = (min_length + 1) if idx == 0 else min_length + + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + self.assertEqual(layer_hidden_states.shape, expected_shape) + + def _check_attentions_for_generate( + self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) + ) + self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) + + for idx, attentions_item in enumerate(attentions): + for iter_attentions in attentions_item: + tgt_len = min_length + + # for first item dummy PAD token is appended so need one more + if idx == 0: + tgt_len += 1 + + src_len = min_length + idx + 1 + + expected_shape = ( + batch_size * num_beam_groups, + config.num_attention_heads, + tgt_len, + src_len, + ) + # check attn size + self.assertListEqual( + [layer_attention.shape for layer_attention in iter_attentions], + [expected_shape] * len(iter_attentions), + ) + + @slow + def test_model_from_pretrained(self): + for model_name in XLNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = XLNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class XLNetModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_xlnet_base_cased(self): + model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased") + model.to(torch_device) + input_ids = torch.tensor( + [ + [ + 67, + 2840, + 19, + 18, + 1484, + 20, + 965, + 29077, + 8719, + 1273, + 21, + 45, + 273, + 17, + 10, + 15048, + 28, + 27511, + 21, + 4185, + 11, + 41, + 2444, + 9, + 32, + 1025, + 20, + 8719, + 26, + 23, + 673, + 966, + 19, + 29077, + 20643, + 27511, + 20822, + 20643, + 19, + 17, + 6616, + 17511, + 18, + 8978, + 20, + 18, + 777, + 9, + 19233, + 1527, + 17669, + 19, + 24, + 673, + 17, + 28756, + 150, + 12943, + 4354, + 153, + 27, + 442, + 37, + 45, + 668, + 21, + 24, + 256, + 20, + 416, + 22, + 2771, + 4901, + 9, + 12943, + 4354, + 153, + 51, + 24, + 3004, + 21, + 28142, + 23, + 65, + 20, + 18, + 416, + 34, + 24, + 2958, + 22947, + 9, + 1177, + 45, + 668, + 3097, + 13768, + 23, + 103, + 28, + 441, + 148, + 48, + 20522, + 19, + 12943, + 4354, + 153, + 12860, + 34, + 18, + 326, + 27, + 17492, + 684, + 21, + 6709, + 9, + 8585, + 123, + 266, + 19, + 12943, + 4354, + 153, + 6872, + 24, + 3004, + 20, + 18, + 9225, + 2198, + 19, + 12717, + 103, + 22, + 401, + 24, + 6348, + 9, + 12943, + 4354, + 153, + 1068, + 2768, + 2286, + 19, + 33, + 104, + 19, + 176, + 24, + 9313, + 19, + 20086, + 28, + 45, + 10292, + 9, + 4, + 3, + ] + ], + dtype=torch.long, + device=torch_device, + ) + # In 1991, the remains of Russian Tsar Nicholas II and his family + # (except for Alexei and Maria) are discovered. + # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the + # remainder of the story. 1883 Western Siberia, + # a young Grigori Rasputin is asked by his father and a group of men to perform magic. + # Rasputin has a vision and denounces one of the men as a horse thief. Although his + # father initially slaps him for making such an accusation, Rasputin watches as the + # man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + # the Virgin Mary, prompting him to become a priest. 
Rasputin quickly becomes famous, + # with people, even a bishop, begging for his blessing. """ + + expected_output_ids = [ + 67, + 2840, + 19, + 18, + 1484, + 20, + 965, + 29077, + 8719, + 1273, + 21, + 45, + 273, + 17, + 10, + 15048, + 28, + 27511, + 21, + 4185, + 11, + 41, + 2444, + 9, + 32, + 1025, + 20, + 8719, + 26, + 23, + 673, + 966, + 19, + 29077, + 20643, + 27511, + 20822, + 20643, + 19, + 17, + 6616, + 17511, + 18, + 8978, + 20, + 18, + 777, + 9, + 19233, + 1527, + 17669, + 19, + 24, + 673, + 17, + 28756, + 150, + 12943, + 4354, + 153, + 27, + 442, + 37, + 45, + 668, + 21, + 24, + 256, + 20, + 416, + 22, + 2771, + 4901, + 9, + 12943, + 4354, + 153, + 51, + 24, + 3004, + 21, + 28142, + 23, + 65, + 20, + 18, + 416, + 34, + 24, + 2958, + 22947, + 9, + 1177, + 45, + 668, + 3097, + 13768, + 23, + 103, + 28, + 441, + 148, + 48, + 20522, + 19, + 12943, + 4354, + 153, + 12860, + 34, + 18, + 326, + 27, + 17492, + 684, + 21, + 6709, + 9, + 8585, + 123, + 266, + 19, + 12943, + 4354, + 153, + 6872, + 24, + 3004, + 20, + 18, + 9225, + 2198, + 19, + 12717, + 103, + 22, + 401, + 24, + 6348, + 9, + 12943, + 4354, + 153, + 1068, + 2768, + 2286, + 19, + 33, + 104, + 19, + 176, + 24, + 9313, + 19, + 20086, + 28, + 45, + 10292, + 9, + 4, + 3, + 19, + 12943, + 4354, + 153, + 27, + 442, + 22, + 2771, + 4901, + 9, + 69, + 27, + 442, + 22, + 2771, + 24, + 11335, + 20, + 18, + 9225, + 2198, + 9, + 69, + 27, + 442, + 22, + 2771, + 24, + 11335, + 20, + 18, + 9225, + 2198, + 9, + 69, + 27, + 442, + 22, + 2771, + ] + # In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) + # are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, + # narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin + # is asked by his father and a group of men to perform magic. Rasputin has a vision and + # denounces one of the men as a horse thief. Although his father initially slaps + # him for making such an accusation, Rasputin watches as the man is chased outside and beaten. + # Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. + # Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. + # , Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary. + # He is asked to perform a ritual of the Virgin Mary. He is asked to perform + + output_ids = model.generate(input_ids, max_length=200, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/test_offline.py b/test_offline.py new file mode 100644 index 0000000000000000000000000000000000000000..45a12a1f2b99da0f75808e94c4e668f2617cdced --- /dev/null +++ b/test_offline.py @@ -0,0 +1,71 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
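+# Background for the test below: setting TRANSFORMERS_OFFLINE=1 makes
+# from_pretrained() resolve everything from the local cache instead of the Hub.
+# The child process additionally monkey-patches socket.socket so that any
+# attempted network call raises, which is what proves the offline code path
+# never touches the network. A rough standalone equivalent:
+#
+#     import os; os.environ["TRANSFORMERS_OFFLINE"] = "1"  # must be set before transformers is imported
+#     from transformers import BertModel
+#     BertModel.from_pretrained("lysandre/tiny-bert-random")  # served from the local cache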
+
+import subprocess
+import sys
+
+from transformers.testing_utils import TestCasePlus, require_torch
+
+
+class OfflineTests(TestCasePlus):
+    @require_torch
+    def test_offline_mode(self):
+
+        # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before
+        # `transformers` is loaded, and it is too late to change it from inside pytest - so we are
+        # changing it while running an external program
+
+        # python one-liner segments
+
+        # this must be loaded before socket.socket is monkey-patched
+        load = """
+from transformers import BertConfig, BertModel, BertTokenizer
+        """
+
+        run = """
+mname = "lysandre/tiny-bert-random"
+BertConfig.from_pretrained(mname)
+BertModel.from_pretrained(mname)
+BertTokenizer.from_pretrained(mname)
+print("success")
+        """
+
+        mock = """
+import socket
+def offline_socket(*args, **kwargs): raise socket.error("Offline mode is enabled")
+socket.socket = offline_socket
+        """
+
+        # baseline - just load from_pretrained with normal network
+        cmd = [sys.executable, "-c", "\n".join([load, run])]
+
+        # should succeed
+        env = self.get_env()
+        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
+        self.assertEqual(result.returncode, 0, result.stderr)
+        self.assertIn("success", result.stdout.decode())
+
+        # next emulate no network
+        cmd = [sys.executable, "-c", "\n".join([load, mock, run])]
+
+        # should normally fail, as it will fail to look up the model files without the network
+        env["TRANSFORMERS_OFFLINE"] = "0"
+        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
+        self.assertEqual(result.returncode, 1, result.stderr)
+
+        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
+        env["TRANSFORMERS_OFFLINE"] = "1"
+        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
+        self.assertEqual(result.returncode, 0, result.stderr)
+        self.assertIn("success", result.stdout.decode())
diff --git a/test_onnx.py b/test_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..db1fc6ac45428351fbb17826b8016b5d63681ec6
--- /dev/null
+++ b/test_onnx.py
@@ -0,0 +1,195 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
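+# Background for the export tests below: transformers.convert_graph_to_onnx
+# drives the whole flow. A minimal usage sketch (paths are illustrative):
+#
+#     from pathlib import Path
+#     from transformers.convert_graph_to_onnx import convert, quantize
+#
+#     convert("pt", "bert-base-cased", Path("/tmp/onnx/model.onnx"), 12, tokenizer=None)
+#     quantized_path = quantize(Path("/tmp/onnx/model.onnx"))  # writes an int8 copy, ideally smaller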
+
+import unittest
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+
+from transformers import BertConfig, BertTokenizerFast, FeatureExtractionPipeline
+from transformers.convert_graph_to_onnx import (
+    convert,
+    ensure_valid_input,
+    generate_identified_filename,
+    infer_shapes,
+    quantize,
+)
+from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow
+
+
+class FuncContiguousArgs:
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        return None
+
+
+class FuncNonContiguousArgs:
+    def forward(self, input_ids, some_other_args, token_type_ids, attention_mask):
+        return None
+
+
+class OnnxExportTestCase(unittest.TestCase):
+    MODEL_TO_TEST = [
+        # (model_name, model_kwargs)
+        ("bert-base-cased", {}),
+        ("gpt2", {"use_cache": False}),  # We don't support exporting GPT2 past keys anymore
+    ]
+
+    @require_tf
+    @slow
+    def test_export_tensorflow(self):
+        for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
+            self._test_export(model, "tf", 12, **model_kwargs)
+
+    @require_torch
+    @slow
+    def test_export_pytorch(self):
+        for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
+            self._test_export(model, "pt", 12, **model_kwargs)
+
+    @require_torch
+    @slow
+    def test_export_custom_bert_model(self):
+        from transformers import BertModel
+
+        vocab = ["[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]", "some", "other", "words"]
+        with NamedTemporaryFile(mode="w+t") as vocab_file:
+            vocab_file.write("\n".join(vocab))
+            vocab_file.flush()
+            tokenizer = BertTokenizerFast(vocab_file.name)
+
+        with TemporaryDirectory() as bert_save_dir:
+            model = BertModel(BertConfig(vocab_size=len(vocab)))
+            model.save_pretrained(bert_save_dir)
+            self._test_export(bert_save_dir, "pt", 12, tokenizer)
+
+    @require_tf
+    @slow
+    def test_quantize_tf(self):
+        for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
+            path = self._test_export(model, "tf", 12, **model_kwargs)
+            quantized_path = quantize(Path(path))
+
+            # Ensure the actual quantized model is not bigger than the original one
+            if quantized_path.stat().st_size >= Path(path).stat().st_size:
+                self.fail("Quantized model is bigger than initial ONNX model")
+
+    @require_torch
+    @slow
+    def test_quantize_pytorch(self):
+        for model, model_kwargs in OnnxExportTestCase.MODEL_TO_TEST:
+            path = self._test_export(model, "pt", 12, **model_kwargs)
+            quantized_path = quantize(path)
+
+            # Ensure the actual quantized model is not bigger than the original one
+            if quantized_path.stat().st_size >= Path(path).stat().st_size:
+                self.fail("Quantized model is bigger than initial ONNX model")
+
+    def _test_export(self, model, framework, opset, tokenizer=None, **model_kwargs):
+        try:
+            # Compute path
+            with TemporaryDirectory() as tempdir:
+                path = Path(tempdir).joinpath("model.onnx")
+
+            # Remove the folder if it exists
+            if path.parent.exists():
+                path.parent.rmdir()
+
+            # Export
+            convert(framework, model, path, opset, tokenizer, **model_kwargs)
+
+            return path
+        except Exception as e:
+            self.fail(e)
+
+    @require_torch
+    @require_tokenizers
+    @slow
+    def test_infer_dynamic_axis_pytorch(self):
+        """
+        Validate that the dynamic axes generated for each parameter are correct
+        """
+        from transformers import BertModel
+
+        model = BertModel(BertConfig.from_pretrained("lysandre/tiny-bert-random"))
+        tokenizer = BertTokenizerFast.from_pretrained("lysandre/tiny-bert-random")
+        self._test_infer_dynamic_axis(model, tokenizer, "pt")
+
+    @require_tf
+    @require_tokenizers
+    @slow
+    def test_infer_dynamic_axis_tf(self):
+        """
+        Validate that the dynamic axes generated for each parameter are correct
+        """
+        from transformers import TFBertModel
+
+        model = TFBertModel(BertConfig.from_pretrained("lysandre/tiny-bert-random"))
+        tokenizer = BertTokenizerFast.from_pretrained("lysandre/tiny-bert-random")
+        self._test_infer_dynamic_axis(model, tokenizer, "tf")
+
+    def _test_infer_dynamic_axis(self, model, tokenizer, framework):
+        feature_extractor = FeatureExtractionPipeline(model, tokenizer)
+
+        variable_names = ["input_ids", "token_type_ids", "attention_mask", "output_0", "output_1"]
+        input_vars, output_vars, shapes, tokens = infer_shapes(feature_extractor, framework)
+
+        # Assert all variables are present
+        self.assertEqual(len(shapes), len(variable_names))
+        self.assertTrue(all([var_name in shapes for var_name in variable_names]))
+        self.assertSequenceEqual(variable_names[:3], input_vars)
+        self.assertSequenceEqual(variable_names[3:], output_vars)
+
+        # Assert inputs are {0: batch, 1: sequence}
+        for var_name in ["input_ids", "token_type_ids", "attention_mask"]:
+            self.assertDictEqual(shapes[var_name], {0: "batch", 1: "sequence"})
+
+        # Assert outputs are {0: batch, 1: sequence} and {0: batch}
+        self.assertDictEqual(shapes["output_0"], {0: "batch", 1: "sequence"})
+        self.assertDictEqual(shapes["output_1"], {0: "batch"})
+
+    def test_ensure_valid_input(self):
+        """
+        Validate that the parameters are correctly exported.
+        GPT2 has a "past" parameter in the middle of input_ids, token_type_ids and attention_mask.
+        ONNX doesn't support export with a dictionary, only a tuple. Thus we need to remove
+        token_type_ids and attention_mask for now so we don't end up with a None tensor in the middle
+        """
+        # All generated args are valid
+        input_names = ["input_ids", "attention_mask", "token_type_ids"]
+        tokens = {"input_ids": [1, 2, 3, 4], "attention_mask": [0, 0, 0, 0], "token_type_ids": [1, 1, 1, 1]}
+        ordered_input_names, inputs_args = ensure_valid_input(FuncContiguousArgs(), tokens, input_names)
+
+        # Should have exactly the same number of args (all are valid)
+        self.assertEqual(len(inputs_args), 3)
+
+        # Should have exactly the same input names
+        self.assertEqual(set(ordered_input_names), set(input_names))
+
+        # Parameters should be reordered according to their respective place in the function:
+        # (input_ids, token_type_ids, attention_mask)
+        self.assertEqual(inputs_args, (tokens["input_ids"], tokens["token_type_ids"], tokens["attention_mask"]))
+
+        # Generated args are interleaved with other args (for instance the "past" parameter in GPT2)
+        ordered_input_names, inputs_args = ensure_valid_input(FuncNonContiguousArgs(), tokens, input_names)
+
+        # Should have exactly one arg (everything before the arg that is not provided, "some_other_args")
+        self.assertEqual(len(inputs_args), 1)
+        self.assertEqual(len(ordered_input_names), 1)
+
+        # Should have only "input_ids"
+        self.assertEqual(inputs_args[0], tokens["input_ids"])
+        self.assertEqual(ordered_input_names[0], "input_ids")
+
+    def test_generate_identified_name(self):
+        generated = generate_identified_filename(Path("/home/something/my_fake_model.onnx"), "-test")
+        self.assertEqual("/home/something/my_fake_model-test.onnx", generated.as_posix())
diff --git a/test_optimization.py b/test_optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c5a31a3a49de2a208c937cd4e23dfecc343004
--- /dev/null
+++ b/test_optimization.py
@@ -0,0 +1,166 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import tempfile
+import unittest
+
+from transformers import is_torch_available
+from transformers.testing_utils import require_torch
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+    from transformers import (
+        Adafactor,
+        AdamW,
+        get_constant_schedule,
+        get_constant_schedule_with_warmup,
+        get_cosine_schedule_with_warmup,
+        get_cosine_with_hard_restarts_schedule_with_warmup,
+        get_linear_schedule_with_warmup,
+        get_polynomial_decay_schedule_with_warmup,
+    )
+
+
+def unwrap_schedule(scheduler, num_steps=10):
+    lrs = []
+    for _ in range(num_steps):
+        lrs.append(scheduler.get_lr()[0])
+        scheduler.step()
+    return lrs
+
+
+def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
+    lrs = []
+    for step in range(num_steps):
+        lrs.append(scheduler.get_lr()[0])
+        scheduler.step()
+        if step == num_steps // 2:
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                file_name = os.path.join(tmpdirname, "schedule.bin")
+                torch.save(scheduler.state_dict(), file_name)
+
+                state_dict = torch.load(file_name)
+                scheduler.load_state_dict(state_dict)
+    return lrs
+
+
+@require_torch
+class OptimizationTest(unittest.TestCase):
+    def assertListAlmostEqual(self, list1, list2, tol):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
+
+    def test_adam_w(self):
+        w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
+        target = torch.tensor([0.4, 0.2, -0.5])
+        criterion = nn.MSELoss()
+        # No warmup, constant schedule, no gradient clipping
+        optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
+        for _ in range(100):
+            loss = criterion(w, target)
+            loss.backward()
+            optimizer.step()
+            w.grad.detach_()  # No zero_grad() on plain tensors, so we do it ourselves.
+            w.grad.zero_()
+        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
+
+    def test_adafactor(self):
+        w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
+        target = torch.tensor([0.4, 0.2, -0.5])
+        criterion = nn.MSELoss()
+        # No warmup, constant schedule, no gradient clipping
+        optimizer = Adafactor(
+            params=[w],
+            lr=1e-2,
+            eps=(1e-30, 1e-3),
+            clip_threshold=1.0,
+            decay_rate=-0.8,
+            beta1=None,
+            weight_decay=0.0,
+            relative_step=False,
+            scale_parameter=False,
+            warmup_init=False,
+        )
+        for _ in range(1000):
+            loss = criterion(w, target)
+            loss.backward()
+            optimizer.step()
+            w.grad.detach_()  # No zero_grad() on plain tensors, so we do it ourselves.
+            w.grad.zero_()
+        self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
+
+
+@require_torch
+class ScheduleInitTest(unittest.TestCase):
+    m = nn.Linear(50, 50) if is_torch_available() else None
+    optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
+    num_steps = 10
+
+    def assertListAlmostEqual(self, list1, list2, tol, msg=None):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol, msg=msg)
+
+    def test_schedulers(self):
+        common_kwargs = {"num_warmup_steps": 2, "num_training_steps": 10}
+        # scheduler dict format:
+        # function: (sched_args_dict, expected_learning_rates)
+        scheds = {
+            get_constant_schedule: ({}, [10.0] * self.num_steps),
+            get_constant_schedule_with_warmup: (
+                {"num_warmup_steps": 4},
+                [0.0, 2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0],
+            ),
+            get_linear_schedule_with_warmup: (
+                {**common_kwargs},
+                [0.0, 5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25],
+            ),
+            get_cosine_schedule_with_warmup: (
+                {**common_kwargs},
+                [0.0, 5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38],
+            ),
+            get_cosine_with_hard_restarts_schedule_with_warmup: (
+                {**common_kwargs, "num_cycles": 2},
+                [0.0, 5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46],
+            ),
+            get_polynomial_decay_schedule_with_warmup: (
+                {**common_kwargs, "power": 2.0, "lr_end": 1e-7},
+                [0.0, 5.0, 10.0, 7.656, 5.625, 3.906, 2.5, 1.406, 0.625, 0.156],
+            ),
+        }
+
+        for scheduler_func, data in scheds.items():
+            kwargs, expected_learning_rates = data
+
+            scheduler = scheduler_func(self.optimizer, **kwargs)
+            self.assertEqual(len([scheduler.get_lr()[0]]), 1)
+            lrs_1 = unwrap_schedule(scheduler, self.num_steps)
+            self.assertListAlmostEqual(
+                lrs_1,
+                expected_learning_rates,
+                tol=1e-2,
+                msg=f"failed for {scheduler_func} in normal scheduler",
+            )
+
+            scheduler = scheduler_func(self.optimizer, **kwargs)
+            lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
+            self.assertListEqual(lrs_1, lrs_2, msg=f"failed for {scheduler_func} in save and reload")
diff --git a/test_optimization_tf.py b/test_optimization_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3a948c938dfdc440cae1ad16c0e5c25056b1aa3
--- /dev/null
+++ b/test_optimization_tf.py
@@ -0,0 +1,100 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
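+
+# GradientAccumulator (exercised below) sums gradients across successive calls
+# until reset() is called. A rough usage sketch, assuming only the API these
+# tests rely on (the training loop and its variables are hypothetical):
+#
+#     accumulator = GradientAccumulator()
+#     for grads in per_micro_batch_gradients:   # hypothetical iterable of gradient lists
+#         accumulator(grads)                    # accumulate in place; bumps accumulator.step
+#     optimizer.apply_gradients(zip(accumulator.gradients, variables))
+#     accumulator.reset()                       # zero the sums and start a new cycle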
+
+import unittest
+
+from transformers import is_tf_available
+from transformers.testing_utils import require_tf
+
+
+if is_tf_available():
+    import tensorflow as tf
+    from tensorflow.python.eager import context
+    from tensorflow.python.framework import ops
+
+    from transformers import GradientAccumulator, create_optimizer
+
+
+@require_tf
+class OptimizationTFTest(unittest.TestCase):
+    def assertListAlmostEqual(self, list1, list2, tol):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
+
+    def testGradientAccumulator(self):
+        accumulator = GradientAccumulator()
+        accumulator([tf.constant([1.0, 2.0])])
+        accumulator([tf.constant([-2.0, 1.0])])
+        accumulator([tf.constant([-1.0, 2.0])])
+        with self.assertRaises(ValueError):
+            accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
+        self.assertEqual(accumulator.step, 3)
+        self.assertEqual(len(accumulator.gradients), 1)
+        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
+        accumulator.reset()
+        self.assertEqual(accumulator.step, 0)
+        self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
+
+    def testGradientAccumulatorDistributionStrategy(self):
+        # Reset the eager context so the logical device configuration below can
+        # still be applied even if TensorFlow has already been initialized
+        context._context = None
+        ops.enable_eager_execution_internal()
+        physical_devices = tf.config.list_physical_devices("CPU")
+        if len(physical_devices) == 1:
+            tf.config.set_logical_device_configuration(
+                physical_devices[0], [tf.config.LogicalDeviceConfiguration(), tf.config.LogicalDeviceConfiguration()]
+            )
+        devices = tf.config.list_logical_devices(device_type="CPU")
+        strategy = tf.distribute.MirroredStrategy(devices=devices[:2])
+
+        with strategy.scope():
+            accumulator = GradientAccumulator()
+            variable = tf.Variable([4.0, 3.0])
+            optimizer, _ = create_optimizer(5e-5, 10, 5)
+            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
+
+        def accumulate_on_replica(gradient):
+            accumulator([gradient])
+
+        def apply_on_replica():
+            optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])))
+
+        @tf.function
+        def accumulate(grad1, grad2):
+            with strategy.scope():
+                local_variables = strategy.experimental_local_results(gradient_placeholder)
+                local_variables[0].assign(grad1)
+                local_variables[1].assign(grad2)
+                strategy.run(accumulate_on_replica, args=(gradient_placeholder,))
+
+        @tf.function
+        def apply_grad():
+            with strategy.scope():
+                strategy.run(apply_on_replica)
+
+        def _check_local_values(grad1, grad2):
+            values = strategy.experimental_local_results(accumulator._gradients[0])
+            self.assertListAlmostEqual(values[0].value(), grad1, tol=1e-2)
+            self.assertListAlmostEqual(values[1].value(), grad2, tol=1e-2)
+
+        accumulate([1.0, 2.0], [-1.0, 1.0])
+        accumulate([3.0, -1.0], [-1.0, -1.0])
+        accumulate([-2.0, 2.0], [3.0, -2.0])
+        self.assertEqual(accumulator.step, 3)
+        _check_local_values([2.0, 3.0], [1.0, -2.0])
+        apply_grad()
+        self.assertListAlmostEqual(variable.value(), [4.0, 3.0], tol=1e-2)
+        accumulator.reset()
+        self.assertEqual(accumulator.step, 0)
+        _check_local_values([0.0, 0.0], [0.0, 0.0])
diff --git a/test_pipelines_automatic_speech_recognition.py b/test_pipelines_automatic_speech_recognition.py
new file mode 100644
index 0000000000000000000000000000000000000000..91dcc71de0182742124b168ffe3ab197d71b5119
--- /dev/null
+++ b/test_pipelines_automatic_speech_recognition.py
@@ -0,0 +1,89 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import AutoFeatureExtractor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC +from transformers.pipelines import AutomaticSpeechRecognitionPipeline +from transformers.testing_utils import require_datasets, require_torch, require_torchaudio, slow + + +# from .test_pipelines_common import CustomInputPipelineCommonMixin + + +class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + # pipeline_task = "automatic-speech-recognition" + # small_models = ["facebook/s2t-small-mustc-en-fr-st"] # Models tested without the @slow decorator + # large_models = [ + # "facebook/wav2vec2-base-960h", + # "facebook/s2t-small-mustc-en-fr-st", + # ] # Models tested with the @slow decorator + + @slow + @require_torch + @require_datasets + def test_simple_wav2vec2(self): + import numpy as np + from datasets import load_dataset + + model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) + + waveform = np.zeros((34000,)) + output = asr(waveform) + self.assertEqual(output, {"text": ""}) + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + filename = ds[0]["file"] + output = asr(filename) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) + + filename = ds[0]["file"] + with open(filename, "rb") as f: + data = f.read() + output = asr(data) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) + + @slow + @require_torch + @require_torchaudio + @require_datasets + def test_simple_s2t(self): + import numpy as np + from datasets import load_dataset + + model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") + tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") + + asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) + + waveform = np.zeros((34000,)) + + output = asr(waveform) + self.assertEqual(output, {"text": "E questo è il motivo per cui non ci siamo mai incontrati."}) + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + filename = ds[0]["file"] + output = asr(filename) + self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) + + filename = ds[0]["file"] + with open(filename, "rb") as f: + data = f.read() + output = asr(data) + self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) diff --git a/test_pipelines_common.py b/test_pipelines_common.py new file mode 100644 index 
0000000000000000000000000000000000000000..5468e4742794602a9575e7824560f62d62a2638d --- /dev/null +++ b/test_pipelines_common.py @@ -0,0 +1,246 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional +from unittest import mock + +from transformers import is_tf_available, is_torch_available, pipeline +from transformers.file_utils import to_py_obj +from transformers.pipelines import Pipeline +from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow + + +VALID_INPUTS = ["A simple string", ["list of strings"]] + + +@is_pipeline_test +class CustomInputPipelineCommonMixin: + pipeline_task = None + pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with + pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with + small_models = [] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + valid_inputs = VALID_INPUTS # Some inputs which are valid to compare fast and slow tokenizers + + def setUp(self) -> None: + if not is_tf_available() and not is_torch_available(): + return # Currently no JAX pipelines + + # Download needed checkpoints + models = self.small_models + if _run_slow_tests: + models = models + self.large_models + + for model_name in models: + if is_torch_available(): + pipeline( + self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + **self.pipeline_loading_kwargs, + ) + if is_tf_available(): + pipeline( + self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + **self.pipeline_loading_kwargs, + ) + + @require_torch + @slow + def test_pt_defaults(self): + pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs) + + @require_tf + @slow + def test_tf_defaults(self): + pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs) + + @require_torch + def test_torch_small(self): + for model_name in self.small_models: + pipe_small = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + **self.pipeline_loading_kwargs, + ) + self._test_pipeline(pipe_small) + + @require_tf + def test_tf_small(self): + for model_name in self.small_models: + pipe_small = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + **self.pipeline_loading_kwargs, + ) + self._test_pipeline(pipe_small) + + @require_torch + @slow + def test_torch_large(self): + for model_name in self.large_models: + pipe_large = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="pt", + **self.pipeline_loading_kwargs, + ) + self._test_pipeline(pipe_large) + + @require_tf + @slow + def test_tf_large(self): + for model_name in self.large_models: + pipe_large = pipeline( + task=self.pipeline_task, + model=model_name, + tokenizer=model_name, + framework="tf", + **self.pipeline_loading_kwargs, + ) + 
self._test_pipeline(pipe_large)
+
+    def _test_pipeline(self, pipe: Pipeline):
+        raise NotImplementedError
+
+    @require_torch
+    def test_compare_slow_fast_torch(self):
+        for model_name in self.small_models:
+            pipe_slow = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                use_fast=False,
+                **self.pipeline_loading_kwargs,
+            )
+            pipe_fast = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                use_fast=True,
+                **self.pipeline_loading_kwargs,
+            )
+            self._compare_slow_fast_pipelines(pipe_slow, pipe_fast, method="forward")
+
+    @require_tf
+    def test_compare_slow_fast_tf(self):
+        for model_name in self.small_models:
+            pipe_slow = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+                use_fast=False,
+                **self.pipeline_loading_kwargs,
+            )
+            pipe_fast = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+                use_fast=True,
+                **self.pipeline_loading_kwargs,
+            )
+            self._compare_slow_fast_pipelines(pipe_slow, pipe_fast, method="call")
+
+    def _compare_slow_fast_pipelines(self, pipe_slow: Pipeline, pipe_fast: Pipeline, method: str):
+        """We check that the inputs to the models' forward passes are identical for
+        slow and fast tokenizers.
+        """
+        with mock.patch.object(
+            pipe_slow.model, method, wraps=getattr(pipe_slow.model, method)
+        ) as mock_slow, mock.patch.object(
+            pipe_fast.model, method, wraps=getattr(pipe_fast.model, method)
+        ) as mock_fast:
+            for inputs in self.valid_inputs:
+                if isinstance(inputs, dict):
+                    inputs.update(self.pipeline_running_kwargs)
+                    _ = pipe_slow(**inputs)
+                    _ = pipe_fast(**inputs)
+                else:
+                    _ = pipe_slow(inputs, **self.pipeline_running_kwargs)
+                    _ = pipe_fast(inputs, **self.pipeline_running_kwargs)
+
+                mock_slow.assert_called()
+                mock_fast.assert_called()
+
+                self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list))
+                # Compare slow calls against fast calls pairwise (the second zip
+                # argument previously re-used mock_slow, which made the comparison vacuous)
+                for mock_slow_call_args, mock_fast_call_args in zip(
+                    mock_slow.call_args_list, mock_fast.call_args_list
+                ):
+                    slow_call_args, slow_call_kwargs = mock_slow_call_args
+                    fast_call_args, fast_call_kwargs = mock_fast_call_args
+
+                    slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs)
+                    fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs)
+
+                    self.assertEqual(slow_call_args, fast_call_args)
+                    self.assertDictEqual(slow_call_kwargs, fast_call_kwargs)
+
+
+@is_pipeline_test
+class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin):
+    """A version of the CustomInputPipelineCommonMixin
+    with a predefined `_test_pipeline` method.
+ """ + + mandatory_keys = {} # Keys which should be in the output + invalid_inputs = [None] # inputs which are not allowed + expected_multi_result: Optional[List] = None + expected_check_keys: Optional[List[str]] = None + + def _test_pipeline(self, pipe: Pipeline): + self.assertIsNotNone(pipe) + + mono_result = pipe(self.valid_inputs[0], **self.pipeline_running_kwargs) + self.assertIsInstance(mono_result, list) + self.assertIsInstance(mono_result[0], (dict, list)) + + if isinstance(mono_result[0], list): + mono_result = mono_result[0] + + for key in self.mandatory_keys: + self.assertIn(key, mono_result[0]) + + multi_result = [pipe(input, **self.pipeline_running_kwargs) for input in self.valid_inputs] + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], (dict, list)) + + if self.expected_multi_result is not None: + for result, expect in zip(multi_result, self.expected_multi_result): + for key in self.expected_check_keys or []: + self.assertEqual( + set([o[key] for o in result]), + set([o[key] for o in expect]), + ) + + if isinstance(multi_result[0], list): + multi_result = multi_result[0] + + for result in multi_result: + for key in self.mandatory_keys: + self.assertIn(key, result) + + self.assertRaises(Exception, pipe, self.invalid_inputs) diff --git a/test_pipelines_conversational.py b/test_pipelines_conversational.py new file mode 100644 index 0000000000000000000000000000000000000000..cfc9512e85bf53ea2119e720f5721a107b1626e5 --- /dev/null +++ b/test_pipelines_conversational.py @@ -0,0 +1,423 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallTokenizer, + Conversation, + ConversationalPipeline, + is_torch_available, + pipeline, +) +from transformers.testing_utils import is_pipeline_test, require_torch, slow, torch_device + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel + +DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0 + + +@is_pipeline_test +class SimpleConversationPipelineTests(unittest.TestCase): + def get_pipeline(self): + # When + config = GPT2Config( + vocab_size=263, + n_ctx=128, + max_length=128, + n_embd=64, + n_layer=1, + n_head=8, + bos_token_id=256, + eos_token_id=257, + ) + model = GPT2LMHeadModel(config) + # Force model output to be L + V, D = model.lm_head.weight.shape + bias = torch.zeros(V) + bias[76] = 1 + weight = torch.zeros((V, D), requires_grad=True) + + model.lm_head.bias = nn.Parameter(bias) + model.lm_head.weight = nn.Parameter(weight) + + # # Created with: + # import tempfile + + # from tokenizers import Tokenizer, models + # from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + # vocab = [(chr(i), i) for i in range(256)] + # tokenizer = Tokenizer(models.Unigram(vocab)) + # with tempfile.NamedTemporaryFile() as f: + # tokenizer.save(f.name) + # real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, eos_token="", bos_token="") + + # real_tokenizer._tokenizer.save("dummy.json") + # Special tokens are automatically added at load time. + tokenizer = AutoTokenizer.from_pretrained("Narsil/small_conversational_test") + conversation_agent = pipeline( + task="conversational", device=DEFAULT_DEVICE_NUM, model=model, tokenizer=tokenizer + ) + return conversation_agent + + @require_torch + def test_integration_torch_conversation(self): + conversation_agent = self.get_pipeline() + conversation_1 = Conversation("Going to the movies tonight - any suggestions?") + conversation_2 = Conversation("What's the last book you have read?") + self.assertEqual(len(conversation_1.past_user_inputs), 0) + self.assertEqual(len(conversation_2.past_user_inputs), 0) + + result = conversation_agent([conversation_1, conversation_2], max_length=48) + + # Two conversations in one pass + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual( + result, + [ + Conversation( + None, + past_user_inputs=["Going to the movies tonight - any suggestions?"], + generated_responses=["L"], + ), + Conversation( + None, past_user_inputs=["What's the last book you have read?"], generated_responses=["L"] + ), + ], + ) + + # One conversation with history + conversation_2.add_user_input("Why do you recommend it?") + result = conversation_agent(conversation_2, max_length=64) + + self.assertEqual(result, conversation_2) + self.assertEqual( + result, + Conversation( + None, + past_user_inputs=["What's the last book you have read?", "Why do you recommend it?"], + generated_responses=["L", "L"], + ), + ) + + +class ConversationalPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "conversational" + small_models = [] # Models tested without the @slow decorator + large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator + invalid_inputs = ["Hi there!", Conversation()] + + def _test_pipeline( + self, conversation_agent + 
    ):  # override the default test method to check that the output is a `Conversation` object
+        self.assertIsNotNone(conversation_agent)
+
+        # We need to recreate conversations for successive tests to pass, as
+        # Conversation objects get *consumed* by the pipeline
+        conversation = Conversation("Hi there!")
+        mono_result = conversation_agent(conversation)
+        self.assertIsInstance(mono_result, Conversation)
+
+        conversations = [Conversation("Hi there!"), Conversation("How are you?")]
+        multi_result = conversation_agent(conversations)
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], Conversation)
+        # The conversations have been consumed and are no longer valid;
+        # inactive conversations passed to the pipeline raise a ValueError
+        self.assertRaises(ValueError, conversation_agent, conversation)
+        self.assertRaises(ValueError, conversation_agent, conversations)
+
+        for bad_input in self.invalid_inputs:
+            self.assertRaises(Exception, conversation_agent, bad_input)
+        self.assertRaises(Exception, conversation_agent, self.invalid_inputs)
+
+    @require_torch
+    @slow
+    def test_integration_torch_conversation(self):
+        # When
+        conversation_agent = pipeline(task="conversational", device=DEFAULT_DEVICE_NUM)
+        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+        conversation_2 = Conversation("What's the last book you have read?")
+        # Then
+        self.assertEqual(len(conversation_1.past_user_inputs), 0)
+        self.assertEqual(len(conversation_2.past_user_inputs), 0)
+        # When
+        result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000)
+        # Then
+        self.assertEqual(result, [conversation_1, conversation_2])
+        self.assertEqual(len(result[0].past_user_inputs), 1)
+        self.assertEqual(len(result[1].past_user_inputs), 1)
+        self.assertEqual(len(result[0].generated_responses), 1)
+        self.assertEqual(len(result[1].generated_responses), 1)
+        self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?")
+        self.assertEqual(result[0].generated_responses[0], "The Big Lebowski")
+        self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?")
+        self.assertEqual(result[1].generated_responses[0], "The Last Question")
+        # When
+        conversation_2.add_user_input("Why do you recommend it?")
+        result = conversation_agent(conversation_2, do_sample=False, max_length=1000)
+        # Then
+        self.assertEqual(result, conversation_2)
+        self.assertEqual(len(result.past_user_inputs), 2)
+        self.assertEqual(len(result.generated_responses), 2)
+        self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?")
+        self.assertEqual(result.generated_responses[1], "It's a good book.")
+
+    @require_torch
+    @slow
+    def test_integration_torch_conversation_truncated_history(self):
+        # When
+        conversation_agent = pipeline(task="conversational", min_length_for_response=24, device=DEFAULT_DEVICE_NUM)
+        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+        # Then
+        self.assertEqual(len(conversation_1.past_user_inputs), 0)
+        # When
+        result = conversation_agent(conversation_1, do_sample=False, max_length=36)
+        # Then
+        self.assertEqual(result, conversation_1)
+        self.assertEqual(len(result.past_user_inputs), 1)
+        self.assertEqual(len(result.generated_responses), 1)
+        self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?")
+        self.assertEqual(result.generated_responses[0], "The Big Lebowski")
+        # When
+        conversation_1.add_user_input("Is it an action
movie?") + result = conversation_agent(conversation_1, do_sample=False, max_length=36) + # Then + self.assertEqual(result, conversation_1) + self.assertEqual(len(result.past_user_inputs), 2) + self.assertEqual(len(result.generated_responses), 2) + self.assertEqual(result.past_user_inputs[1], "Is it an action movie?") + self.assertEqual(result.generated_responses[1], "It's a comedy.") + + @require_torch + @slow + def test_integration_torch_conversation_dialogpt_input_ids(self): + tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") + model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + conversation_1 = Conversation("hello") + inputs = conversation_agent._parse_and_tokenize([conversation_1]) + self.assertEqual(inputs["input_ids"].tolist(), [[31373, 50256]]) + + conversation_2 = Conversation("how are you ?", past_user_inputs=["hello"], generated_responses=["Hi there!"]) + inputs = conversation_agent._parse_and_tokenize([conversation_2]) + self.assertEqual( + inputs["input_ids"].tolist(), [[31373, 50256, 17250, 612, 0, 50256, 4919, 389, 345, 5633, 50256]] + ) + + inputs = conversation_agent._parse_and_tokenize([conversation_1, conversation_2]) + self.assertEqual( + inputs["input_ids"].tolist(), + [ + [31373, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], + [31373, 50256, 17250, 612, 0, 50256, 4919, 389, 345, 5633, 50256], + ], + ) + + @require_torch + @slow + def test_integration_torch_conversation_blenderbot_400M_input_ids(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + # test1 + conversation_1 = Conversation("hello") + inputs = conversation_agent._parse_and_tokenize([conversation_1]) + self.assertEqual(inputs["input_ids"].tolist(), [[1710, 86, 2]]) + + # test2 + conversation_1 = Conversation( + "I like lasagne.", + past_user_inputs=["hello"], + generated_responses=[ + " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie." + ], + ) + inputs = conversation_agent._parse_and_tokenize([conversation_1]) + self.assertEqual( + inputs["input_ids"].tolist(), + [ + # This should be compared with the same conversation on ParlAI `safe_interactive` demo. + [ + 1710, # hello + 86, + 228, # Double space + 228, + 946, + 304, + 398, + 6881, + 558, + 964, + 38, + 452, + 315, + 265, + 6252, + 452, + 322, + 968, + 6884, + 3146, + 278, + 306, + 265, + 617, + 87, + 388, + 75, + 341, + 286, + 521, + 21, + 228, # Double space + 228, + 281, # I like lasagne. + 398, + 6881, + 558, + 964, + 21, + 2, # EOS + ] + ], + ) + + @require_torch + @slow + def test_integration_torch_conversation_blenderbot_400M(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + + conversation_1 = Conversation("hello") + result = conversation_agent( + conversation_1, + ) + self.assertEqual( + result.generated_responses[0], + # ParlAI implementation output, we have a different one, but it's our + # second best, you can check by using num_return_sequences=10 + # " Hello! How are you? I'm just getting ready to go to work, how about you?", + " Hello! 
How are you doing today? I just got back from a walk with my dog.", + ) + + conversation_1 = Conversation("Lasagne hello") + result = conversation_agent(conversation_1, encoder_no_repeat_ngram_size=3) + self.assertEqual( + result.generated_responses[0], + " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie.", + ) + + conversation_1 = Conversation( + "Lasagne hello Lasagne is my favorite Italian dish. Do you like lasagne? I like lasagne." + ) + result = conversation_agent( + conversation_1, + encoder_no_repeat_ngram_size=3, + ) + self.assertEqual( + result.generated_responses[0], + " Me too. I like how it can be topped with vegetables, meats, and condiments.", + ) + + @require_torch + @slow + def test_integration_torch_conversation_encoder_decoder(self): + # When + tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M") + conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer, device=DEFAULT_DEVICE_NUM) + + conversation_1 = Conversation("My name is Sarah and I live in London") + conversation_2 = Conversation("Going to the movies tonight, What movie would you recommend? ") + # Then + self.assertEqual(len(conversation_1.past_user_inputs), 0) + self.assertEqual(len(conversation_2.past_user_inputs), 0) + # When + result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) + # Then + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual(len(result[0].past_user_inputs), 1) + self.assertEqual(len(result[1].past_user_inputs), 1) + self.assertEqual(len(result[0].generated_responses), 1) + self.assertEqual(len(result[1].generated_responses), 1) + self.assertEqual(result[0].past_user_inputs[0], "My name is Sarah and I live in London") + self.assertEqual( + result[0].generated_responses[0], + "hi sarah, i live in london as well. do you have any plans for the weekend?", + ) + self.assertEqual( + result[1].past_user_inputs[0], "Going to the movies tonight, What movie would you recommend? " + ) + self.assertEqual( + result[1].generated_responses[0], "i don't know... i'm not really sure. what movie are you going to see?" + ) + # When + conversation_1.add_user_input("Not yet, what about you?") + conversation_2.add_user_input("What's your name?") + result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) + # Then + self.assertEqual(result, [conversation_1, conversation_2]) + self.assertEqual(len(result[0].past_user_inputs), 2) + self.assertEqual(len(result[1].past_user_inputs), 2) + self.assertEqual(len(result[0].generated_responses), 2) + self.assertEqual(len(result[1].generated_responses), 2) + self.assertEqual(result[0].past_user_inputs[1], "Not yet, what about you?") + self.assertEqual(result[0].generated_responses[1], "i don't have any plans yet. 
i'm not sure what to do yet.") + self.assertEqual(result[1].past_user_inputs[1], "What's your name?") + self.assertEqual(result[1].generated_responses[1], "i don't have a name, but i'm going to see a horror movie.") + + @require_torch + @slow + def test_from_pipeline_conversation(self): + model_id = "facebook/blenderbot_small-90M" + + # from model id + conversation_agent_from_model_id = pipeline("conversational", model=model_id, tokenizer=model_id) + + # from model object + model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_id) + tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_id) + conversation_agent_from_model = pipeline("conversational", model=model, tokenizer=tokenizer) + + conversation = Conversation("My name is Sarah and I live in London") + conversation_copy = Conversation("My name is Sarah and I live in London") + + result_model_id = conversation_agent_from_model_id([conversation]) + result_model = conversation_agent_from_model([conversation_copy]) + + # check for equality + self.assertEqual( + result_model_id.generated_responses[0], + "hi sarah, i live in london as well. do you have any plans for the weekend?", + ) + self.assertEqual( + result_model_id.generated_responses[0], + result_model.generated_responses[0], + ) diff --git a/test_pipelines_feature_extraction.py b/test_pipelines_feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..8c372bda587d987480c8a929ec294ff7870dc8fc --- /dev/null +++ b/test_pipelines_feature_extraction.py @@ -0,0 +1,26 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +class FeatureExtractionPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "feature-extraction" + small_models = [ + "sshleifer/tiny-distilbert-base-cased" + ] # Default model - Models tested without the @slow decorator + large_models = [None] # Models tested with the @slow decorator + mandatory_keys = {} # Keys which should be in the output diff --git a/test_pipelines_fill_mask.py b/test_pipelines_fill_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..8865bae0c8aac0eac5d4aa54104f0ddd759ec2d3 --- /dev/null +++ b/test_pipelines_fill_mask.py @@ -0,0 +1,244 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
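+
+# The fill-mask pipeline scores candidate tokens for a masked position; a
+# minimal sketch mirroring the calls below (the model name is the small test
+# checkpoint this file already uses):
+#
+#     unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base")
+#     outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara"])
+#     # each output dict carries "sequence", "score", "token" and "token_str"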
+
+import unittest
+
+from transformers import pipeline
+from transformers.testing_utils import require_tf, require_torch, slow
+
+from .test_pipelines_common import MonoInputPipelineCommonMixin
+
+
+EXPECTED_FILL_MASK_RESULT = [
+    [
+        {"sequence": "My name is John", "score": 0.00782308354973793, "token": 610, "token_str": " John"},
+        {"sequence": "My name is Chris", "score": 0.007475061342120171, "token": 1573, "token_str": " Chris"},
+    ],
+    [
+        {
+            "sequence": "The largest city in France is Paris",
+            "score": 0.2510891854763031,
+            "token": 2201,
+            "token_str": " Paris",
+        },
+        {
+            "sequence": "The largest city in France is Lyon",
+            "score": 0.21418564021587372,
+            "token": 12790,
+            "token_str": " Lyon",
+        },
+    ],
+]
+
+EXPECTED_FILL_MASK_TARGET_RESULT = [EXPECTED_FILL_MASK_RESULT[0]]
+
+
+class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
+    pipeline_task = "fill-mask"
+    pipeline_loading_kwargs = {"top_k": 2}
+    small_models = ["sshleifer/tiny-distilroberta-base"]  # Models tested without the @slow decorator
+    large_models = ["distilroberta-base"]  # Models tested with the @slow decorator
+    mandatory_keys = {"sequence", "score", "token"}
+    valid_inputs = [
+        "My name is <mask>",
+        "The largest city in France is <mask>",
+    ]
+    invalid_inputs = [
+        "This is <mask> <mask>",  # More than 1 mask_token in the input is not supported
+        "This is",  # No mask_token is not supported
+    ]
+    expected_check_keys = ["sequence"]
+
+    @require_torch
+    def test_torch_fill_mask(self):
+        valid_inputs = "My name is <mask>"
+        unmasker = pipeline(task="fill-mask", model=self.small_models[0])
+        outputs = unmasker(valid_inputs)
+        self.assertIsInstance(outputs, list)
+
+        # This passes
+        outputs = unmasker(valid_inputs, targets=[" Patrick", " Clara"])
+        self.assertIsInstance(outputs, list)
+
+        # This used to fail with `cannot mix args and kwargs`
+        outputs = unmasker(valid_inputs, something=False)
+        self.assertIsInstance(outputs, list)
+
+    @require_torch
+    def test_torch_fill_mask_with_targets(self):
+        valid_inputs = ["My name is <mask>"]
+        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
+        invalid_targets = [[], [""], ""]
+        for model_name in self.small_models:
+            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
+            for targets in valid_targets:
+                outputs = unmasker(valid_inputs, targets=targets)
+                self.assertIsInstance(outputs, list)
+                self.assertEqual(len(outputs), len(targets))
+            for targets in invalid_targets:
+                self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)
+
+    @require_tf
+    def test_tf_fill_mask_with_targets(self):
+        valid_inputs = ["My name is <mask>"]
+        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
+        invalid_targets = [[], [""], ""]
+        for model_name in self.small_models:
+            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf")
+            for targets in valid_targets:
+                outputs = unmasker(valid_inputs, targets=targets)
+                self.assertIsInstance(outputs, list)
+                self.assertEqual(len(outputs), len(targets))
+            for targets in invalid_targets:
+                self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)
+
+    @require_torch
+    @slow
+    def test_torch_fill_mask_results(self):
+        mandatory_keys = {"sequence", "score", "token"}
+        valid_inputs = [
+            "My name is <mask>",
+            "The largest city in France is <mask>",
+        ]
+        valid_targets = [" Patrick", " Clara"]
+        for model_name in self.large_models:
+            unmasker = pipeline(
+                task="fill-mask",
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                top_k=2,
+            )
+
+            mono_result = unmasker(valid_inputs[0], targets=valid_targets)
+            self.assertIsInstance(mono_result, list)
+            self.assertIsInstance(mono_result[0], dict)
+
+            for mandatory_key in mandatory_keys:
+                self.assertIn(mandatory_key, mono_result[0])
+
+            multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
+            self.assertIsInstance(multi_result, list)
+            self.assertIsInstance(multi_result[0], (dict, list))
+
+            for result, expected in zip(multi_result, EXPECTED_FILL_MASK_RESULT):
+                for r, e in zip(result, expected):
+                    self.assertEqual(r["sequence"], e["sequence"])
+                    self.assertEqual(r["token_str"], e["token_str"])
+                    self.assertEqual(r["token"], e["token"])
+                    self.assertAlmostEqual(r["score"], e["score"], places=3)
+
+            if isinstance(multi_result[0], list):
+                multi_result = multi_result[0]
+
+            for result in multi_result:
+                for key in mandatory_keys:
+                    self.assertIn(key, result)
+
+            self.assertRaises(Exception, unmasker, [None])
+
+            valid_inputs = valid_inputs[:1]
+            mono_result = unmasker(valid_inputs[0], targets=valid_targets)
+            self.assertIsInstance(mono_result, list)
+            self.assertIsInstance(mono_result[0], dict)
+
+            for mandatory_key in mandatory_keys:
+                self.assertIn(mandatory_key, mono_result[0])
+
+            multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
+            self.assertIsInstance(multi_result, list)
+            self.assertIsInstance(multi_result[0], (dict, list))
+
+            for result, expected in zip(multi_result, EXPECTED_FILL_MASK_TARGET_RESULT):
+                for r, e in zip(result, expected):
+                    self.assertEqual(r["sequence"], e["sequence"])
+                    self.assertEqual(r["token_str"], e["token_str"])
+                    self.assertEqual(r["token"], e["token"])
+                    self.assertAlmostEqual(r["score"], e["score"], places=3)
+
+            if isinstance(multi_result[0], list):
+                multi_result = multi_result[0]
+
+            for result in multi_result:
+                for key in mandatory_keys:
+                    self.assertIn(key, result)
+
+            self.assertRaises(Exception, unmasker, [None])
+
+    @require_tf
+    @slow
+    def test_tf_fill_mask_results(self):
+        mandatory_keys = {"sequence", "score", "token"}
+        valid_inputs = [
+            "My name is <mask>",
+            "The largest city in France is <mask>",
+        ]
+        valid_targets = [" Patrick", " Clara"]
+        for model_name in self.large_models:
+            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", top_k=2)
+
+            mono_result = unmasker(valid_inputs[0], targets=valid_targets)
+            self.assertIsInstance(mono_result, list)
+            self.assertIsInstance(mono_result[0], dict)
+
+            for mandatory_key in mandatory_keys:
+                self.assertIn(mandatory_key, mono_result[0])
+
+            multi_result =
[unmasker(valid_input) for valid_input in valid_inputs] + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], (dict, list)) + + for result, expected in zip(multi_result, EXPECTED_FILL_MASK_TARGET_RESULT): + for r, e in zip(result, expected): + self.assertEqual(r["sequence"], e["sequence"]) + self.assertEqual(r["token_str"], e["token_str"]) + self.assertEqual(r["token"], e["token"]) + self.assertAlmostEqual(r["score"], e["score"], places=3) + + if isinstance(multi_result[0], list): + multi_result = multi_result[0] + + for result in multi_result: + for key in mandatory_keys: + self.assertIn(key, result) + + self.assertRaises(Exception, unmasker, [None]) diff --git a/test_pipelines_image_classification.py b/test_pipelines_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..ecfab4c76dd1c2f033e9799c4796574bd1e6b422 --- /dev/null +++ b/test_pipelines_image_classification.py @@ -0,0 +1,130 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import ( + AutoFeatureExtractor, + AutoModelForImageClassification, + PreTrainedTokenizer, + is_vision_available, +) +from transformers.pipelines import ImageClassificationPipeline, pipeline +from transformers.testing_utils import require_torch, require_vision + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@require_vision +@require_torch +class ImageClassificationPipelineTests(unittest.TestCase): + pipeline_task = "image-classification" + small_models = ["lysandre/tiny-vit-random"] # Models tested without the @slow decorator + valid_inputs = [ + {"images": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + { + "images": [ + "http://images.cocodataset.org/val2017/000000039769.jpg", + "http://images.cocodataset.org/val2017/000000039769.jpg", + ] + }, + {"images": "./tests/fixtures/tests_samples/COCO/000000039769.png"}, + { + "images": [ + "./tests/fixtures/tests_samples/COCO/000000039769.png", + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + {"images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")}, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + ] + }, + { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + }, + ] + + def test_small_model_from_factory(self): + for small_model in self.small_models: + + image_classifier = pipeline("image-classification", model=small_model) + + for valid_input in self.valid_inputs: + output = image_classifier(**valid_input) + top_k = valid_input.get("top_k", 5) + + def assert_valid_pipeline_output(pipeline_output): + self.assertTrue(isinstance(pipeline_output, list)) + self.assertEqual(len(pipeline_output), top_k) + for 
label_result in pipeline_output:
+                    self.assertTrue(isinstance(label_result, dict))
+                    self.assertIn("label", label_result)
+                    self.assertIn("score", label_result)
+
+            if isinstance(valid_input["images"], list):
+                self.assertEqual(len(valid_input["images"]), len(output))
+                for individual_output in output:
+                    assert_valid_pipeline_output(individual_output)
+            else:
+                assert_valid_pipeline_output(output)
+
+    def test_small_model_from_pipeline(self):
+        for small_model in self.small_models:
+
+            model = AutoModelForImageClassification.from_pretrained(small_model)
+            feature_extractor = AutoFeatureExtractor.from_pretrained(small_model)
+            image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
+
+            for valid_input in self.valid_inputs:
+                output = image_classifier(**valid_input)
+                top_k = valid_input.get("top_k", 5)
+
+                def assert_valid_pipeline_output(pipeline_output):
+                    self.assertTrue(isinstance(pipeline_output, list))
+                    self.assertEqual(len(pipeline_output), top_k)
+                    for label_result in pipeline_output:
+                        self.assertTrue(isinstance(label_result, dict))
+                        self.assertIn("label", label_result)
+                        self.assertIn("score", label_result)
+
+                if isinstance(valid_input["images"], list):
+                    # When images are batched, the pipeline output is a list of lists of dictionaries
+                    self.assertEqual(len(valid_input["images"]), len(output))
+                    for individual_output in output:
+                        assert_valid_pipeline_output(individual_output)
+                else:
+                    # When a single image is passed, the pipeline output is a list of dictionaries
+                    assert_valid_pipeline_output(output)
+
+    def test_custom_tokenizer(self):
+        tokenizer = PreTrainedTokenizer()
+
+        # Assert that the pipeline can be initialized with a tokenizer that is not in any mapping
+        image_classifier = pipeline("image-classification", model=self.small_models[0], tokenizer=tokenizer)
+
+        self.assertIs(image_classifier.tokenizer, tokenizer)
diff --git a/test_pipelines_question_answering.py b/test_pipelines_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c0264068333e7e1d5ce0014de286f8768100969
--- /dev/null
+++ b/test_pipelines_question_answering.py
@@ -0,0 +1,250 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
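+
+# The question-answering pipeline accepts {"question", "context"} dicts or
+# question=/context= keyword arguments; a rough sketch of the call shape these
+# tests rely on (the tiny SQuAD checkpoint is the one listed in small_models):
+#
+#     qa = pipeline(task="question-answering", model="sshleifer/tiny-distilbert-base-cased-distilled-squad")
+#     answer = qa(question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris.")
+#     # `answer` is a dict with "score", "answer", "start" and "end" keys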
+ +import unittest + +from transformers.data.processors.squad import SquadExample +from transformers.pipelines import Pipeline, QuestionAnsweringArgumentHandler, pipeline +from transformers.testing_utils import slow + +from .test_pipelines_common import CustomInputPipelineCommonMixin + + +class QAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "question-answering" + pipeline_running_kwargs = { + "padding": "max_length", + "max_seq_len": 25, + "doc_stride": 5, + } # Default is 'longest' but we use 'max_length' to test equivalence between slow/fast tokenizers + small_models = [ + "sshleifer/tiny-distilbert-base-cased-distilled-squad" + ] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + valid_inputs = [ + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, + { + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, + { + "question": ["In what field is HuggingFace working ?", "In what field is HuggingFace working ?"], + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, + { + "question": ["In what field is HuggingFace working ?", "In what field is HuggingFace working ?"], + "context": [ + "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + ], + }, + ] + + def get_pipelines(self): + question_answering_pipelines = [ + pipeline( + task=self.pipeline_task, + model=model, + tokenizer=model, + framework="pt", + **self.pipeline_loading_kwargs, + ) + for model in self.small_models + ] + return question_answering_pipelines + + @slow + def test_high_topk_small_context(self): + self.pipeline_running_kwargs.update({"topk": 20}) + valid_inputs = [ + {"question": "Where was HuggingFace founded ?", "context": "Paris"}, + ] + question_answering_pipelines = self.get_pipelines() + output_keys = {"score", "answer", "start", "end"} + for question_answering_pipeline in question_answering_pipelines: + result = question_answering_pipeline(valid_inputs, **self.pipeline_running_kwargs) + self.assertIsInstance(result, dict) + + for key in output_keys: + self.assertIn(key, result) + + def _test_pipeline(self, question_answering_pipeline: Pipeline): + output_keys = {"score", "answer", "start", "end"} + valid_inputs = [ + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, + { + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, + ] + invalid_inputs = [ + {"question": "", "context": "This is a test to try empty question edge case"}, + {"question": None, "context": "This is a test to try empty question edge case"}, + {"question": "What is does with empty context ?", "context": ""}, + {"question": "What is does with empty context ?", "context": None}, + ] + self.assertIsNotNone(question_answering_pipeline) + + mono_result = question_answering_pipeline(valid_inputs[0]) + self.assertIsInstance(mono_result, dict) + + for key in output_keys: + self.assertIn(key, mono_result) + + multi_result = question_answering_pipeline(valid_inputs) + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], dict) + + for 
result in multi_result: + for key in output_keys: + self.assertIn(key, result) + for bad_input in invalid_inputs: + self.assertRaises(ValueError, question_answering_pipeline, bad_input) + self.assertRaises(ValueError, question_answering_pipeline, invalid_inputs) + + def test_argument_handler(self): + qa = QuestionAnsweringArgumentHandler() + + Q = "Where was HuggingFace founded ?" + C = "HuggingFace was founded in Paris" + + normalized = qa(Q, C) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa(question=Q, context=C) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa(question=[Q, Q], context=C) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 2) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa({"question": Q, "context": C}) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa([{"question": Q, "context": C}]) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa([{"question": Q, "context": C}, {"question": Q, "context": C}]) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 2) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa(X={"question": Q, "context": C}) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa(X=[{"question": Q, "context": C}]) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + normalized = qa(data={"question": Q, "context": C}) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 1) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + def test_argument_handler_error_handling(self): + qa = QuestionAnsweringArgumentHandler() + + Q = "Where was HuggingFace founded ?"
+ C = "HuggingFace was founded in Paris" + + with self.assertRaises(KeyError): + qa({"context": C}) + with self.assertRaises(KeyError): + qa({"question": Q}) + with self.assertRaises(KeyError): + qa([{"context": C}]) + with self.assertRaises(ValueError): + qa(None, C) + with self.assertRaises(ValueError): + qa("", C) + with self.assertRaises(ValueError): + qa(Q, None) + with self.assertRaises(ValueError): + qa(Q, "") + + with self.assertRaises(ValueError): + qa(question=None, context=C) + with self.assertRaises(ValueError): + qa(question="", context=C) + with self.assertRaises(ValueError): + qa(question=Q, context=None) + with self.assertRaises(ValueError): + qa(question=Q, context="") + + with self.assertRaises(ValueError): + qa({"question": None, "context": C}) + with self.assertRaises(ValueError): + qa({"question": "", "context": C}) + with self.assertRaises(ValueError): + qa({"question": Q, "context": None}) + with self.assertRaises(ValueError): + qa({"question": Q, "context": ""}) + + with self.assertRaises(ValueError): + qa([{"question": Q, "context": C}, {"question": None, "context": C}]) + with self.assertRaises(ValueError): + qa([{"question": Q, "context": C}, {"question": "", "context": C}]) + + with self.assertRaises(ValueError): + qa([{"question": Q, "context": C}, {"question": Q, "context": None}]) + with self.assertRaises(ValueError): + qa([{"question": Q, "context": C}, {"question": Q, "context": ""}]) + + with self.assertRaises(ValueError): + qa(question={"This": "Is weird"}, context="This is a context") + + with self.assertRaises(ValueError): + qa(question=[Q, Q], context=[C, C, C]) + + with self.assertRaises(ValueError): + qa(question=[Q, Q, Q], context=[C, C]) + + def test_argument_handler_old_format(self): + qa = QuestionAnsweringArgumentHandler() + + Q = "Where was HuggingFace founded ?" + C = "HuggingFace was founded in Paris" + # Backward compatibility for this + normalized = qa(question=[Q, Q], context=[C, C]) + self.assertEqual(type(normalized), list) + self.assertEqual(len(normalized), 2) + self.assertEqual({type(el) for el in normalized}, {SquadExample}) + + def test_argument_handler_error_handling_odd(self): + qa = QuestionAnsweringArgumentHandler() + with self.assertRaises(ValueError): + qa(None) + + with self.assertRaises(ValueError): + qa(Y=None) + + with self.assertRaises(ValueError): + qa(1) diff --git a/test_pipelines_summarization.py b/test_pipelines_summarization.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc55e9915ac548a954269126d4060d364762b91 --- /dev/null +++ b/test_pipelines_summarization.py @@ -0,0 +1,103 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import AutoTokenizer, is_torch_available, pipeline +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.tokenization_utils import TruncationStrategy + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers.models.bart import BartConfig, BartForConditionalGeneration + +DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0 + + +class SimpleSummarizationPipelineTests(unittest.TestCase): + @require_torch + def test_input_too_long(self): + torch.manual_seed(0) + config = BartConfig( + vocab_size=257, + d_model=32, + encoder_layers=1, + decoder_layers=1, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + # So any text > 4 should raise an exception + max_position_embeddings=4, + encoder_attention_heads=1, + decoder_attention_heads=1, + max_length=4, + min_length=1, + forced_eos_token_id=None, + ) + model = BartForConditionalGeneration(config) + # Bias output towards L + V, C = model.lm_head.weight.shape + + bias = torch.zeros(V) + bias[76] = 10 + + model.lm_head.bias = nn.Parameter(bias) + + # # Generated with: + # import tempfile + # from tokenizers import Tokenizer, models + # from transformers import PreTrainedTokenizerFast + # model_max_length = 4 + # vocab = [(chr(i), i) for i in range(256)] + # tokenizer = Tokenizer(models.Unigram(vocab)) + # with tempfile.NamedTemporaryFile() as f: + # tokenizer.save(f.name) + # real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, model_max_length=model_max_length) + # real_tokenizer._tokenizer.save("tokenizer.json") + # # + add missing config.json with albert as model_type + tokenizer = AutoTokenizer.from_pretrained("Narsil/small_summarization_test") + summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer) + + with self.assertLogs("transformers", level="WARNING"): + with self.assertRaises(IndexError): + _ = summarizer("This is a test") + + output = summarizer("This is a test", truncation=TruncationStrategy.ONLY_FIRST) + # 2 is default BOS from Bart. + self.assertEqual(output, [{"summary_text": "\x02 L L L"}]) + + +class SummarizationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "summarization" + pipeline_running_kwargs = {"num_beams": 2, "min_length": 2, "max_length": 5} + small_models = [ + "patrickvonplaten/t5-tiny-random", + "sshleifer/bart-tiny-random", + ] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + invalid_inputs = [4, ""] + mandatory_keys = ["summary_text"] + + @require_torch + @slow + def test_integration_torch_summarization(self): + summarizer = pipeline(task="summarization", device=DEFAULT_DEVICE_NUM) + cnn_article = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. 
As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + expected_cnn_summary = " The Palestinian Authority becomes the 123rd member of the International Criminal Court . The move gives the court jurisdiction over alleged crimes in Palestinian territories . Israel and the United States opposed the Palestinians' efforts to join the court . Rights group Human Rights Watch welcomes the move, says governments seeking to penalize Palestine should end pressure ." 
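+ # The default summarization model decodes with beam search, which is deterministic, so the full summary text can be compared verbatim below.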
+ result = summarizer(cnn_article) + self.assertEqual(result[0]["summary_text"], expected_cnn_summary) diff --git a/test_pipelines_table_question_answering.py b/test_pipelines_table_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..24a2c6d163f804e1801331269e4ae7d1f6bd9944 --- /dev/null +++ b/test_pipelines_table_question_answering.py @@ -0,0 +1,280 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers.pipelines import Pipeline, pipeline +from transformers.testing_utils import require_pandas, require_torch, require_torch_scatter, slow + +from .test_pipelines_common import CustomInputPipelineCommonMixin + + +@require_torch_scatter +@require_torch +@require_pandas +class TQAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "table-question-answering" + pipeline_running_kwargs = { + "padding": "max_length", + } + small_models = [ + "lysandre/tiny-tapas-random-wtq", + "lysandre/tiny-tapas-random-sqa", + ] + large_models = ["google/tapas-base-finetuned-wtq"] # Models tested with the @slow decorator + valid_inputs = [ + { + "table": { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + "query": "how many movies has george clooney played in?", + }, + { + "table": { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + "query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], + }, + { + "table": { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + "query": [ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ], + }, + ] + + def _test_pipeline(self, table_querier: Pipeline): + output_keys = {"answer", "coordinates", "cells"} + valid_inputs = self.valid_inputs + invalid_inputs = [ + {"query": "What does it do with empty context ?", "table": ""}, + {"query": "What does it do with empty context ?", "table": None}, + ] + self.assertIsNotNone(table_querier) + + mono_result = table_querier(valid_inputs[0]) + self.assertIsInstance(mono_result, dict) + + for key in output_keys: + self.assertIn(key, mono_result) + + multi_result = table_querier(valid_inputs) + self.assertIsInstance(multi_result, list) + for result in multi_result: + self.assertIsInstance(result, (list, 
dict)) + + for result in multi_result: + if isinstance(result, list): + for _result in result: + for key in output_keys: + self.assertIn(key, _result) + else: + for key in output_keys: + self.assertIn(key, result) + for bad_input in invalid_inputs: + self.assertRaises(ValueError, table_querier, bad_input) + self.assertRaises(ValueError, table_querier, invalid_inputs) + + def test_aggregation(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-wtq", + tokenizer="lysandre/tiny-tapas-random-wtq", + ) + self.assertIsInstance(table_querier.model.config.aggregation_labels, dict) + self.assertIsInstance(table_querier.model.config.no_aggregation_label_index, int) + + mono_result = table_querier(self.valid_inputs[0]) + multi_result = table_querier(self.valid_inputs) + + self.assertIn("aggregator", mono_result) + + for result in multi_result: + if isinstance(result, list): + for _result in result: + self.assertIn("aggregator", _result) + else: + self.assertIn("aggregator", result) + + def test_empty_errors(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-wtq", + tokenizer="lysandre/tiny-tapas-random-wtq", + ) + self.assertIsInstance(table_querier.model.config.aggregation_labels, dict) + self.assertIsInstance(table_querier.model.config.no_aggregation_label_index, int) + + with self.assertRaises(ValueError): + table_querier( + { + "table": {}, + "query": "how many movies has george clooney played in?", + } + ) + with self.assertRaises(ValueError): + table_querier( + { + "query": "how many movies has george clooney played in?", + } + ) + with self.assertRaises(ValueError): + table_querier( + { + "table": { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + "query": "", + } + ) + with self.assertRaises(ValueError): + table_querier( + { + "table": { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + } + ) + + def test_aggregation_with_sequential(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-wtq", + tokenizer="lysandre/tiny-tapas-random-wtq", + ) + mono_result = table_querier(self.valid_inputs[0], sequential=True) + multi_result = table_querier(self.valid_inputs, sequential=True) + + self.assertIn("aggregator", mono_result) + + for result in multi_result: + if isinstance(result, list): + for _result in result: + self.assertIn("aggregator", _result) + else: + self.assertIn("aggregator", result) + + def test_sequential(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-sqa", + tokenizer="lysandre/tiny-tapas-random-sqa", + ) + sequential_mono_result_0 = table_querier(self.valid_inputs[0], sequential=True) + sequential_mono_result_1 = table_querier(self.valid_inputs[1], sequential=True) + sequential_multi_result = table_querier(self.valid_inputs, sequential=True) + mono_result_0 = table_querier(self.valid_inputs[0]) + mono_result_1 = table_querier(self.valid_inputs[1]) + multi_result = table_querier(self.valid_inputs) + + # First valid input has a single question, so the dicts should be equal + self.assertDictEqual(sequential_mono_result_0, mono_result_0) + + # Second valid input
has several questions, the questions following the first one should not be equal + self.assertNotEqual(sequential_mono_result_1, mono_result_1) + + # Assert that we get the same results when passing in several sequences. + for index, (sequential_multi, multi) in enumerate(zip(sequential_multi_result, multi_result)): + if index == 0: + self.assertDictEqual(sequential_multi, multi) + else: + self.assertNotEqual(sequential_multi, multi) + + @slow + def test_integration_wtq(self): + table_querier = pipeline("table-question-answering") + + data = { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + } + queries = [ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ] + + results = table_querier(data, queries) + + expected_results = [ + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, + { + "answer": "COUNT > Transformers, Datasets, Tokenizers", + "coordinates": [(0, 0), (1, 0), (2, 0)], + "cells": ["Transformers", "Datasets", "Tokenizers"], + "aggregator": "COUNT", + }, + { + "answer": "AVERAGE > 36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + "aggregator": "AVERAGE", + }, + { + "answer": "SUM > 36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + "aggregator": "SUM", + }, + ] + self.assertListEqual(results, expected_results) + + @slow + def test_integration_sqa(self): + table_querier = pipeline( + "table-question-answering", + model="google/tapas-base-finetuned-sqa", + tokenizer="google/tapas-base-finetuned-sqa", + ) + data = { + "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + "Age": ["56", "45", "59"], + "Number of movies": ["87", "53", "69"], + "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"] + results = table_querier(data, queries, sequential=True) + + expected_results = [ + {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]}, + {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]}, + {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]}, + ] + self.assertListEqual(results, expected_results) diff --git a/test_pipelines_text2text_generation.py b/test_pipelines_text2text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..6d1b21b6a2be21f1c3f03926948590f6c3c7e464 --- /dev/null +++ b/test_pipelines_text2text_generation.py @@ -0,0 +1,25 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +class Text2TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "text2text-generation" + small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + invalid_inputs = [4, ""] + mandatory_keys = ["generated_text"] diff --git a/test_pipelines_text_classification.py b/test_pipelines_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..7db8a24116c5edc8173022fa60e7b59d0ec86ab6 --- /dev/null +++ b/test_pipelines_text_classification.py @@ -0,0 +1,26 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +class TextClassificationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "sentiment-analysis" + small_models = [ + "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" + ] # Default model - Models tested without the @slow decorator + large_models = [None] # Models tested with the @slow decorator + mandatory_keys = {"label", "score"} # Keys which should be in the output diff --git a/test_pipelines_text_generation.py b/test_pipelines_text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..1a2d77b55e573fd4d713f46166bf57a499822991 --- /dev/null +++ b/test_pipelines_text_generation.py @@ -0,0 +1,62 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import pipeline +from transformers.testing_utils import require_torch + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +class TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "text-generation" + pipeline_running_kwargs = {"prefix": "This is "} + small_models = ["sshleifer/tiny-ctrl"] # Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + + def test_simple_generation(self): + text_generator = pipeline(task="text-generation", model=self.small_models[0]) + # text-generation is non-deterministic by nature, we can't fully test the output + + outputs = text_generator("This is a test") + + self.assertEqual(len(outputs), 1) + self.assertEqual(list(outputs[0].keys()), ["generated_text"]) + self.assertEqual(type(outputs[0]["generated_text"]), str) + + outputs = text_generator(["This is a test", "This is a second test"]) + self.assertEqual(len(outputs[0]), 1) + self.assertEqual(list(outputs[0][0].keys()), ["generated_text"]) + self.assertEqual(type(outputs[0][0]["generated_text"]), str) + self.assertEqual(list(outputs[1][0].keys()), ["generated_text"]) + self.assertEqual(type(outputs[1][0]["generated_text"]), str) + + @require_torch + def test_generation_output_style(self): + text_generator = pipeline(task="text-generation", model=self.small_models[0]) + # text-generation is non-deterministic by nature, we can't fully test the output + + outputs = text_generator("This is a test") + self.assertIn("This is a test", outputs[0]["generated_text"]) + + outputs = text_generator("This is a test", return_full_text=False) + self.assertNotIn("This is a test", outputs[0]["generated_text"]) + + text_generator = pipeline(task="text-generation", model=self.small_models[0], return_full_text=False) + outputs = text_generator("This is a test") + self.assertNotIn("This is a test", outputs[0]["generated_text"]) + + outputs = text_generator("This is a test", return_full_text=True) + self.assertIn("This is a test", outputs[0]["generated_text"]) diff --git a/test_pipelines_token_classification.py b/test_pipelines_token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..4197dae5da92f5c010af6c8b2432bd6a7b3133a5 --- /dev/null +++ b/test_pipelines_token_classification.py @@ -0,0 +1,551 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline +from transformers.pipelines import AggregationStrategy, Pipeline, TokenClassificationArgumentHandler +from transformers.testing_utils import nested_simplify, require_tf, require_torch, slow + +from .test_pipelines_common import CustomInputPipelineCommonMixin + + +VALID_INPUTS = ["A simple string", ["list of strings", "A simple string that is quite a bit longer"]] + + +class TokenClassificationPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "ner" + small_models = [ + "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english" + ] # Default model - Models tested without the @slow decorator + large_models = [] # Models tested with the @slow decorator + + def _test_pipeline(self, token_classifier: Pipeline): + output_keys = {"entity", "word", "score", "start", "end", "index"} + if token_classifier.aggregation_strategy != AggregationStrategy.NONE: + output_keys = {"entity_group", "word", "score", "start", "end"} + + self.assertIsNotNone(token_classifier) + + mono_result = token_classifier(VALID_INPUTS[0]) + self.assertIsInstance(mono_result, list) + self.assertIsInstance(mono_result[0], (dict, list)) + + if isinstance(mono_result[0], list): + mono_result = mono_result[0] + + for key in output_keys: + self.assertIn(key, mono_result[0]) + + multi_result = [token_classifier(input) for input in VALID_INPUTS] + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], (dict, list)) + + if isinstance(multi_result[0], list): + multi_result = multi_result[0] + + for result in multi_result: + for key in output_keys: + self.assertIn(key, result) + + @require_torch + @slow + def test_spanish_bert(self): + # https://github.com/huggingface/transformers/pull/4987 + NER_MODEL = "mrm8488/bert-spanish-cased-finetuned-ner" + model = AutoModelForTokenClassification.from_pretrained(NER_MODEL) + tokenizer = AutoTokenizer.from_pretrained(NER_MODEL, use_fast=True) + sentence = """Consuelo Araújo Noguera, ministra de cultura del presidente Andrés Pastrana (1998.2002) fue asesinada por las Farc luego de haber permanecido secuestrada por algunos meses.""" + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer) + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity": "B-PER", "score": 0.999, "word": "Cons", "start": 0, "end": 4, "index": 1}, + {"entity": "B-PER", "score": 0.803, "word": "##uelo", "start": 4, "end": 8, "index": 2}, + {"entity": "I-PER", "score": 0.999, "word": "Ara", "start": 9, "end": 12, "index": 3}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.999, "word": "Cons", "start": 0, "end": 4}, + {"entity_group": "PER", "score": 0.966, "word": "##uelo Araújo Noguera", "start": 4, "end": 23}, + {"entity_group": "PER", "score": 1.0, "word": "Andrés Pastrana", "start": 60, "end": 75}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.999, "word": "Consuelo Araújo Noguera", "start": 0, "end": 23}, + {"entity_group": "PER", "score": 1.0, "word": "Andrés Pastrana", "start": 60, 
"end": 75}, + {"entity_group": "ORG", "score": 0.999, "word": "Farc", "start": 110, "end": 114}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="max") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.999, "word": "Consuelo Araújo Noguera", "start": 0, "end": 23}, + {"entity_group": "PER", "score": 1.0, "word": "Andrés Pastrana", "start": 60, "end": 75}, + {"entity_group": "ORG", "score": 0.999, "word": "Farc", "start": 110, "end": 114}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.966, "word": "Consuelo Araújo Noguera", "start": 0, "end": 23}, + {"entity_group": "PER", "score": 1.0, "word": "Andrés Pastrana", "start": 60, "end": 75}, + {"entity_group": "ORG", "score": 0.542, "word": "Farc", "start": 110, "end": 114}, + ], + ) + + @require_torch + @slow + def test_dbmdz_english(self): + # Other sentence + NER_MODEL = "dbmdz/bert-large-cased-finetuned-conll03-english" + model = AutoModelForTokenClassification.from_pretrained(NER_MODEL) + tokenizer = AutoTokenizer.from_pretrained(NER_MODEL, use_fast=True) + sentence = """Enzo works at the the UN""" + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer) + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output), + [ + {"entity": "I-PER", "score": 0.997, "word": "En", "start": 0, "end": 2, "index": 1}, + {"entity": "I-PER", "score": 0.996, "word": "##zo", "start": 2, "end": 4, "index": 2}, + {"entity": "I-ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24, "index": 7}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output), + [ + {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="max") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output[:3]), + [ + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + ], + ) + + token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average") + output = token_classifier(sentence) + self.assertEqual( + nested_simplify(output), + [ + {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 22, "end": 24}, + ], + ) + + @require_torch + def test_aggregation_strategy(self): + model_name = self.small_models[0] + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt") 
+ # Just to understand score indexes in this test + self.assertEqual( + token_classifier.model.config.id2label, + {0: "O", 1: "B-MISC", 2: "I-MISC", 3: "B-PER", 4: "I-PER", 5: "B-ORG", 6: "I-ORG", 7: "B-LOC", 8: "I-LOC"}, + ) + example = [ + { + # fmt: off + "scores": np.array([0, 0, 0, 0, 0.9968166351318359, 0, 0, 0]), + "index": 1, + "is_subword": False, + "word": "En", + "start": 0, + "end": 2, + }, + { + # fmt: off + "scores": np.array([0, 0, 0, 0, 0.9957635998725891, 0, 0, 0]), + "index": 2, + "is_subword": True, + "word": "##zo", + "start": 2, + "end": 4, + }, + { + # fmt: off + "scores": np.array([0, 0, 0, 0, 0, 0.9986497163772583, 0, 0, ]), + # fmt: on + "index": 7, + "word": "UN", + "is_subword": False, + "start": 11, + "end": 13, + }, + ] + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.NONE)), + [ + {"end": 2, "entity": "I-PER", "score": 0.997, "start": 0, "word": "En", "index": 1}, + {"end": 4, "entity": "I-PER", "score": 0.996, "start": 2, "word": "##zo", "index": 2}, + {"end": 13, "entity": "B-ORG", "score": 0.999, "start": 11, "word": "UN", "index": 7}, + ], + ) + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.SIMPLE)), + [ + {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 11, "end": 13}, + ], + ) + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.FIRST)), + [ + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 11, "end": 13}, + ], + ) + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.MAX)), + [ + {"entity_group": "PER", "score": 0.997, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 11, "end": 13}, + ], + ) + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.AVERAGE)), + [ + {"entity_group": "PER", "score": 0.996, "word": "Enzo", "start": 0, "end": 4}, + {"entity_group": "ORG", "score": 0.999, "word": "UN", "start": 11, "end": 13}, + ], + ) + + @require_torch + def test_aggregation_strategy_example2(self): + model_name = self.small_models[0] + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt") + # Just to understand score indexes in this test + self.assertEqual( + token_classifier.model.config.id2label, + {0: "O", 1: "B-MISC", 2: "I-MISC", 3: "B-PER", 4: "I-PER", 5: "B-ORG", 6: "I-ORG", 7: "B-LOC", 8: "I-LOC"}, + ) + example = [ + { + # Necessary for AVERAGE + "scores": np.array([0, 0.55, 0, 0.45, 0, 0, 0, 0, 0, 0]), + "is_subword": False, + "index": 1, + "word": "Ra", + "start": 0, + "end": 2, + }, + { + "scores": np.array([0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0]), + "is_subword": True, + "word": "##ma", + "start": 2, + "end": 4, + "index": 2, + }, + { + # 4th score will have the highest average + # 4th score is B-PER for this model + # It does not correspond to any of the subtokens.
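+ # (AVERAGE pools index 3 as (0.45 + 0.2 + 0.4) / 3 = 0.35, which is why PER wins below.)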
+ "scores": np.array([0, 0, 0, 0.4, 0, 0, 0.6, 0, 0, 0]), + "is_subword": True, + "word": "##zotti", + "start": 11, + "end": 13, + "index": 3, + }, + ] + self.assertEqual( + token_classifier.aggregate(example, AggregationStrategy.NONE), + [ + {"end": 2, "entity": "B-MISC", "score": 0.55, "start": 0, "word": "Ra", "index": 1}, + {"end": 4, "entity": "B-LOC", "score": 0.8, "start": 2, "word": "##ma", "index": 2}, + {"end": 13, "entity": "I-ORG", "score": 0.6, "start": 11, "word": "##zotti", "index": 3}, + ], + ) + + self.assertEqual( + token_classifier.aggregate(example, AggregationStrategy.FIRST), + [{"entity_group": "MISC", "score": 0.55, "word": "Ramazotti", "start": 0, "end": 13}], + ) + self.assertEqual( + token_classifier.aggregate(example, AggregationStrategy.MAX), + [{"entity_group": "LOC", "score": 0.8, "word": "Ramazotti", "start": 0, "end": 13}], + ) + self.assertEqual( + nested_simplify(token_classifier.aggregate(example, AggregationStrategy.AVERAGE)), + [{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}], + ) + + @require_torch + def test_gather_pre_entities(self): + + model_name = self.small_models[0] + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt") + + sentence = "Hello there" + + tokens = tokenizer( + sentence, + return_attention_mask=False, + return_tensors="pt", + truncation=True, + return_special_tokens_mask=True, + return_offsets_mapping=True, + ) + offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0] + special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0] + input_ids = tokens["input_ids"].numpy()[0] + # First element in [CLS] + scores = np.array([[1, 0, 0], [0.1, 0.3, 0.6], [0.8, 0.1, 0.1]]) + + pre_entities = token_classifier.gather_pre_entities( + sentence, input_ids, scores, offset_mapping, special_tokens_mask + ) + self.assertEqual( + nested_simplify(pre_entities), + [ + {"word": "Hello", "scores": [0.1, 0.3, 0.6], "start": 0, "end": 5, "is_subword": False, "index": 1}, + { + "word": "there", + "scores": [0.8, 0.1, 0.1], + "index": 2, + "start": 6, + "end": 11, + "is_subword": False, + }, + ], + ) + + @require_tf + def test_tf_only(self): + model_name = "Narsil/small" # This model only has a TensorFlow version + # We test that if we don't specificy framework='tf', it gets detected automatically + token_classifier = pipeline(task="ner", model=model_name) + self._test_pipeline(token_classifier) + + @require_tf + def test_tf_defaults(self): + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="tf") + self._test_pipeline(token_classifier) + + @require_tf + def test_tf_small_ignore_subwords_available_for_fast_tokenizers(self): + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline( + task="ner", + model=model_name, + tokenizer=tokenizer, + framework="tf", + aggregation_strategy=AggregationStrategy.FIRST, + ) + self._test_pipeline(token_classifier) + + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline( + task="ner", + model=model_name, + tokenizer=tokenizer, + framework="tf", + aggregation_strategy=AggregationStrategy.SIMPLE, + ) + self._test_pipeline(token_classifier) + + 
@require_torch + def test_pt_ignore_subwords_slow_tokenizer_raises(self): + model_name = self.small_models[0] + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + + with self.assertRaises(ValueError): + pipeline(task="ner", model=model_name, tokenizer=tokenizer, aggregation_strategy=AggregationStrategy.FIRST) + with self.assertRaises(ValueError): + pipeline( + task="ner", model=model_name, tokenizer=tokenizer, aggregation_strategy=AggregationStrategy.AVERAGE + ) + with self.assertRaises(ValueError): + pipeline(task="ner", model=model_name, tokenizer=tokenizer, aggregation_strategy=AggregationStrategy.MAX) + + @require_torch + def test_pt_defaults_slow_tokenizer(self): + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer) + self._test_pipeline(token_classifier) + + @require_torch + def test_pt_defaults(self): + for model_name in self.small_models: + token_classifier = pipeline(task="ner", model=model_name) + self._test_pipeline(token_classifier) + + @slow + @require_torch + def test_warnings(self): + with self.assertWarns(UserWarning): + token_classifier = pipeline(task="ner", model=self.small_models[0], grouped_entities=True) + self.assertEqual(token_classifier.aggregation_strategy, AggregationStrategy.SIMPLE) + with self.assertWarns(UserWarning): + token_classifier = pipeline( + task="ner", model=self.small_models[0], grouped_entities=True, ignore_subwords=True + ) + self.assertEqual(token_classifier.aggregation_strategy, AggregationStrategy.FIRST) + + @slow + @require_torch + def test_simple(self): + token_classifier = pipeline(task="ner", model="dslim/bert-base-NER", grouped_entities=True) + sentence = "Hello Sarah Jessica Parker who Jessica lives in New York" + sentence2 = "This is a simple test" + output = token_classifier(sentence) + + output_ = nested_simplify(output) + + self.assertEqual( + output_, + [ + { + "entity_group": "PER", + "score": 0.996, + "word": "Sarah Jessica Parker", + "start": 6, + "end": 26, + }, + {"entity_group": "PER", "score": 0.977, "word": "Jessica", "start": 31, "end": 38}, + {"entity_group": "LOC", "score": 0.999, "word": "New York", "start": 48, "end": 56}, + ], + ) + + output = token_classifier([sentence, sentence2]) + output_ = nested_simplify(output) + + self.assertEqual( + output_, + [ + [ + {"entity_group": "PER", "score": 0.996, "word": "Sarah Jessica Parker", "start": 6, "end": 26}, + {"entity_group": "PER", "score": 0.977, "word": "Jessica", "start": 31, "end": 38}, + {"entity_group": "LOC", "score": 0.999, "word": "New York", "start": 48, "end": 56}, + ], + [], + ], + ) + + @require_torch + def test_pt_small_ignore_subwords_available_for_fast_tokenizers(self): + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline( + task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=True + ) + self._test_pipeline(token_classifier) + + for model_name in self.small_models: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline( + task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=False + ) + self._test_pipeline(token_classifier) + + +class TokenClassificationArgumentHandlerTestCase(unittest.TestCase): + def setUp(self): + self.args_parser = TokenClassificationArgumentHandler() + + def test_simple(self): + string = 
"This is a simple input" + + inputs, offset_mapping = self.args_parser(string) + self.assertEqual(inputs, [string]) + self.assertEqual(offset_mapping, None) + + inputs, offset_mapping = self.args_parser([string, string]) + self.assertEqual(inputs, [string, string]) + self.assertEqual(offset_mapping, None) + + inputs, offset_mapping = self.args_parser(string, offset_mapping=[(0, 1), (1, 2)]) + self.assertEqual(inputs, [string]) + self.assertEqual(offset_mapping, [[(0, 1), (1, 2)]]) + + inputs, offset_mapping = self.args_parser( + [string, string], offset_mapping=[[(0, 1), (1, 2)], [(0, 2), (2, 3)]] + ) + self.assertEqual(inputs, [string, string]) + self.assertEqual(offset_mapping, [[(0, 1), (1, 2)], [(0, 2), (2, 3)]]) + + def test_errors(self): + string = "This is a simple input" + + # 2 sentences, 1 offset_mapping, args + with self.assertRaises(TypeError): + self.args_parser(string, string, offset_mapping=[[(0, 1), (1, 2)]]) + + # 2 sentences, 1 offset_mapping, args + with self.assertRaises(TypeError): + self.args_parser(string, string, offset_mapping=[(0, 1), (1, 2)]) + + # 2 sentences, 1 offset_mapping, input_list + with self.assertRaises(ValueError): + self.args_parser([string, string], offset_mapping=[[(0, 1), (1, 2)]]) + + # 2 sentences, 1 offset_mapping, input_list + with self.assertRaises(ValueError): + self.args_parser([string, string], offset_mapping=[(0, 1), (1, 2)]) + + # 1 sentences, 2 offset_mapping + with self.assertRaises(ValueError): + self.args_parser(string, offset_mapping=[[(0, 1), (1, 2)], [(0, 2), (2, 3)]]) + + # 0 sentences, 1 offset_mapping + with self.assertRaises(TypeError): + self.args_parser(offset_mapping=[[(0, 1), (1, 2)]]) diff --git a/test_pipelines_translation.py b/test_pipelines_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..222f7b4ed5895333abfa22f48df94a3b78f6aa5e --- /dev/null +++ b/test_pipelines_translation.py @@ -0,0 +1,100 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import pytest + +from transformers import pipeline +from transformers.testing_utils import is_pipeline_test, is_torch_available, require_torch, slow + +from .test_pipelines_common import MonoInputPipelineCommonMixin + + +if is_torch_available(): + from transformers.models.mbart import MBart50TokenizerFast, MBartForConditionalGeneration + + +class TranslationEnToDePipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "translation_en_to_de" + small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator + large_models = [None] # Models tested with the @slow decorator + invalid_inputs = [4, ""] + mandatory_keys = ["translation_text"] + + +class TranslationEnToRoPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "translation_en_to_ro" + small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator + large_models = [None] # Models tested with the @slow decorator + invalid_inputs = [4, ""] + mandatory_keys = ["translation_text"] + + +@is_pipeline_test +class TranslationNewFormatPipelineTests(unittest.TestCase): + @require_torch + @slow + def test_default_translations(self): + # We don't provide a default for this pair + with self.assertRaises(ValueError): + pipeline(task="translation_cn_to_ar") + + # but we do for this one + translator = pipeline(task="translation_en_to_de") + self.assertEqual(translator.src_lang, "en") + self.assertEqual(translator.tgt_lang, "de") + + @require_torch + @slow + def test_multilingual_translation(self): + model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + + translator = pipeline(task="translation", model=model, tokenizer=tokenizer) + # Missing src_lang, tgt_lang + with self.assertRaises(ValueError): + translator("This is a test") + + outputs = translator("This is a test", src_lang="en_XX", tgt_lang="ar_AR") + self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}]) + + outputs = translator("This is a test", src_lang="en_XX", tgt_lang="hi_IN") + self.assertEqual(outputs, [{"translation_text": "यह एक परीक्षण है"}]) + + # src_lang, tgt_lang can be defined at pipeline call time + translator = pipeline(task="translation", model=model, tokenizer=tokenizer, src_lang="en_XX", tgt_lang="ar_AR") + outputs = translator("This is a test") + self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}]) + + @require_torch + def test_translation_on_odd_language(self): + model = "patrickvonplaten/t5-tiny-random" + translator = pipeline(task="translation_cn_to_ar", model=model) + self.assertEqual(translator.src_lang, "cn") + self.assertEqual(translator.tgt_lang, "ar") + + @require_torch + def test_translation_default_language_selection(self): + model = "patrickvonplaten/t5-tiny-random" + with pytest.warns(UserWarning, match=r".*translation_en_to_de.*"): + translator = pipeline(task="translation", model=model) + self.assertEqual(translator.task, "translation_en_to_de") + self.assertEqual(translator.src_lang, "en") + self.assertEqual(translator.tgt_lang, "de") + + @require_torch + def test_translation_with_no_language_no_model_fails(self): + with self.assertRaises(ValueError): + pipeline(task="translation") diff --git a/test_pipelines_zero_shot.py b/test_pipelines_zero_shot.py new file mode 100644 index
0000000000000000000000000000000000000000..20f2666c8135057285edb7827785c1c00151ea2d --- /dev/null +++ b/test_pipelines_zero_shot.py @@ -0,0 +1,167 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from copy import deepcopy + +from transformers.pipelines import Pipeline + +from .test_pipelines_common import CustomInputPipelineCommonMixin + + +class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "zero-shot-classification" + small_models = [ + "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" + ] # Models tested without the @slow decorator + large_models = ["roberta-large-mnli"] # Models tested with the @slow decorator + valid_inputs = [ + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]}, + {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"}, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "This text is about {}", + }, + ] + + def _test_scores_sum_to_one(self, result): + sum = 0.0 + for score in result["scores"]: + sum += score + self.assertAlmostEqual(sum, 1.0, places=5) + + def _test_entailment_id(self, zero_shot_classifier: Pipeline): + config = zero_shot_classifier.model.config + original_config = deepcopy(config) + + config.label2id = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2} + self.assertEqual(zero_shot_classifier.entailment_id, -1) + + config.label2id = {"entailment": 0, "neutral": 1, "contradiction": 2} + self.assertEqual(zero_shot_classifier.entailment_id, 0) + + config.label2id = {"ENTAIL": 0, "NON-ENTAIL": 1} + self.assertEqual(zero_shot_classifier.entailment_id, 0) + + config.label2id = {"ENTAIL": 2, "NEUTRAL": 1, "CONTR": 0} + self.assertEqual(zero_shot_classifier.entailment_id, 2) + + zero_shot_classifier.model.config = original_config + + def _test_pipeline(self, zero_shot_classifier: Pipeline): + output_keys = {"sequence", "labels", "scores"} + valid_mono_inputs = [ + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]}, + {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"}, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "This text is about {}", + }, + ] + valid_multi_input = { + "sequences": ["Who are you voting 
for in 2020?", "What is the capital of Spain?"], + "candidate_labels": "politics", + } + invalid_inputs = [ + {"sequences": None, "candidate_labels": "politics"}, + {"sequences": "", "candidate_labels": "politics"}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": None}, + {"sequences": "Who are you voting for in 2020?", "candidate_labels": ""}, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": None, + }, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "", + }, + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": "politics", + "hypothesis_template": "Template without formatting syntax.", + }, + ] + self.assertIsNotNone(zero_shot_classifier) + + self._test_entailment_id(zero_shot_classifier) + + for mono_input in valid_mono_inputs: + mono_result = zero_shot_classifier(**mono_input) + self.assertIsInstance(mono_result, dict) + if len(mono_result["labels"]) > 1: + self._test_scores_sum_to_one(mono_result) + + for key in output_keys: + self.assertIn(key, mono_result) + + multi_result = zero_shot_classifier(**valid_multi_input) + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], dict) + self.assertEqual(len(multi_result), len(valid_multi_input["sequences"])) + + for result in multi_result: + for key in output_keys: + self.assertIn(key, result) + + if len(result["labels"]) > 1: + self._test_scores_sum_to_one(result) + + for bad_input in invalid_inputs: + self.assertRaises(Exception, zero_shot_classifier, **bad_input) + + if zero_shot_classifier.model.name_or_path in self.large_models: + # We also check the outputs for the large models + inputs = [ + { + "sequences": "Who are you voting for in 2020?", + "candidate_labels": ["politics", "public health", "science"], + }, + { + "sequences": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.", + "candidate_labels": ["machine learning", "statistics", "translation", "vision"], + "multi_label": True, + }, + ] + + expected_outputs = [ + { + "sequence": "Who are you voting for in 2020?", + "labels": ["politics", "public health", "science"], + "scores": [0.975, 0.015, 0.008], + }, + { + "sequence": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. 
The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.", + "labels": ["translation", "machine learning", "vision", "statistics"], + "scores": [0.817, 0.712, 0.018, 0.017], + }, + ] + + for input, expected_output in zip(inputs, expected_outputs): + output = zero_shot_classifier(**input) + for key in output: + if key == "scores": + for output_score, expected_score in zip(output[key], expected_output[key]): + self.assertAlmostEqual(output_score, expected_score, places=2) + else: + self.assertEqual(output[key], expected_output[key]) diff --git a/test_processor_clip.py b/test_processor_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e8d7a73e537b67046071594a12f401e57a1c2753 --- /dev/null +++ b/test_processor_clip.py @@ -0,0 +1,177 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
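For orientation, the zero-shot tests above pin down the user-facing contract of the pipeline: a single sequence yields one dict with `sequence`, `labels`, and `scores`, labels come back sorted by descending score, and the scores form a distribution summing to ~1.0 unless `multi_label=True`. A minimal usage sketch of that contract (not part of the diff), assuming the same `roberta-large-mnli` checkpoint the slow tests load:

```python
from transformers import pipeline

# Sketch only: mirrors the contract asserted by the tests above.
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")

result = classifier(
    "Who are you voting for in 2020?",
    candidate_labels=["politics", "public health", "science"],
)
print(result["labels"][0])              # highest-scoring label first
print(round(sum(result["scores"]), 5))  # ~1.0 when multi_label is off
```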
+ +import json +import os +import shutil +import tempfile +import unittest + +import numpy as np +import pytest + +from transformers import CLIPTokenizer +from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_vision_available +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPFeatureExtractor, CLIPProcessor + + +@require_vision +class CLIPProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + # fmt: off + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low", "er", "lowest", "newer", "wider", "<unk>", "<|endoftext|>"] + # fmt: on + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + feature_extractor_map = { + "do_resize": True, + "size": 20, + "do_center_crop": True, + "crop_size": 18, + "do_normalize": True, + "image_mean": [0.48145466, 0.4578275, 0.40821073], + "image_std": [0.26862954, 0.26130258, 0.27577711], + } + self.feature_extractor_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.feature_extractor_file, "w", encoding="utf-8") as fp: + json.dump(feature_extractor_map, fp) + + def get_tokenizer(self, **kwargs): + return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return CLIPFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def prepare_image_inputs(self): + """This function prepares a list of PIL images. Unlike its namesakes in other + test files it takes no numpify/torchify flags and always returns PIL images.
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = CLIPProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLIPTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = CLIPProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = CLIPProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLIPTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, CLIPFeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = feature_extractor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + 
decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) diff --git a/test_processor_speech_to_text.py b/test_processor_speech_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..76a7a7446152d46027c520299966e630bc6d5540 --- /dev/null +++ b/test_processor_speech_to_text.py @@ -0,0 +1,148 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile +import unittest +from pathlib import Path +from shutil import copyfile + +from transformers import Speech2TextTokenizer, is_speech_available +from transformers.file_utils import FEATURE_EXTRACTOR_NAME +from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json +from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio + +from .test_feature_extraction_speech_to_text import floats_list + + +if is_speech_available(): + from transformers import Speech2TextFeatureExtractor, Speech2TextProcessor + + +SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_torch +@require_torchaudio +@require_sentencepiece +class Speech2TextProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab = ["<s>", "<pad>", "</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + save_dir = Path(self.tmpdirname) + save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) + if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) + + tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) + tokenizer.save_pretrained(self.tmpdirname) + + feature_extractor_map = { + "feature_size": 24, + "num_mel_bins": 24, + "padding_value": 0.0, + "sampling_rate": 16000, + "return_attention_mask": False, + "do_normalize": True, + } + save_json(feature_extractor_map, save_dir / FEATURE_EXTRACTOR_NAME) + + def get_tokenizer(self, **kwargs): + return Speech2TextTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return Speech2TextFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = Speech2TextProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = Speech2TextProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, Speech2TextTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + 
self.assertIsInstance(processor.feature_extractor, Speech2TextFeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = Speech2TextProcessor( + tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor() + ) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = Speech2TextProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, Speech2TextTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Speech2TextFeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Speech2TextProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Speech2TextProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "This is a test string" + + with processor.as_target_processor(): + encoded_processor = processor(input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Speech2TextProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) diff --git a/test_processor_wav2vec2.py b/test_processor_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..7d30b0693463408280d4ca687220dbebc8842f72 --- /dev/null +++ b/test_processor_wav2vec2.py @@ -0,0 +1,139 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
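The Speech2Text tests above and the Wav2Vec2 file that follows exercise the same processor contract: a processor is a (feature extractor, tokenizer) pair that survives a `save_pretrained`/`from_pretrained` round trip with both components intact. A minimal sketch of that round trip, assuming the public `facebook/s2t-small-librispeech-asr` checkpoint rather than the fixture-built one the tests use:

```python
import tempfile

from transformers import Speech2TextProcessor

# Checkpoint name is an assumption for illustration; any Speech2Text checkpoint works.
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
with tempfile.TemporaryDirectory() as tmp:
    processor.save_pretrained(tmp)
    reloaded = Speech2TextProcessor.from_pretrained(tmp)

# Both components come back identical, which is exactly what the tests assert.
assert reloaded.tokenizer.get_vocab() == processor.tokenizer.get_vocab()
assert reloaded.feature_extractor.to_json_string() == processor.feature_extractor.to_json_string()
```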
+ +import json +import os +import shutil +import tempfile +import unittest + +from transformers.file_utils import FEATURE_EXTRACTOR_NAME +from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES + +from .test_feature_extraction_wav2vec2 import floats_list + + +class Wav2Vec2ProcessorTest(unittest.TestCase): + def setUp(self): + vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + self.add_kwargs_tokens_map = { + "pad_token": "<pad>", + "unk_token": "<unk>", + "bos_token": "<s>", + "eos_token": "</s>", + } + feature_extractor_map = { + "feature_size": 1, + "padding_value": 0.0, + "sampling_rate": 16000, + "return_attention_mask": False, + "do_normalize": True, + } + + self.tmpdirname = tempfile.mkdtemp() + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + + with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(feature_extractor_map) + "\n") + + def get_tokenizer(self, **kwargs): + kwargs.update(self.add_kwargs_tokens_map) + return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return Wav2Vec2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = Wav2Vec2Processor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = Wav2Vec2Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = Wav2Vec2Processor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + 
input_processor = processor(raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "This is a test string" + + with processor.as_target_processor(): + encoded_processor = processor(input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) diff --git a/test_retrieval_rag.py b/test_retrieval_rag.py new file mode 100644 index 0000000000000000000000000000000000000000..d0b68c4a335764f1731d43288cb46091f583f40f --- /dev/null +++ b/test_retrieval_rag.py @@ -0,0 +1,387 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
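Every retrieval test in the file below queries the same two-document dummy index (doc 0 embedded as all ones, doc 1 as all twos) with one all-ones and one all-negative-ones hidden state, so the expected `doc_ids == [[1], [0]]` follow directly from the inner products. A short sketch of that arithmetic with the same `retrieval_vector_size = 8` the tests set:

```python
import numpy as np

d = 8                                          # retrieval_vector_size in the tests
docs = np.stack([np.ones(d), 2 * np.ones(d)])  # doc 0 and doc 1 embeddings
queries = np.stack([np.ones(d), -np.ones(d)])  # the two query hidden states

scores = queries @ docs.T       # inner products: [[8., 16.], [-8., -16.]]
print(scores.argmax(axis=1))    # -> [1 0]: doc 1 for query 0, doc 0 for query 1
```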
+ +import json +import os +import pickle +import shutil +import tempfile +from unittest import TestCase +from unittest.mock import patch + +import numpy as np +from datasets import Dataset + +from transformers import is_faiss_available +from transformers.models.bart.configuration_bart import BartConfig +from transformers.models.bart.tokenization_bart import BartTokenizer +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES +from transformers.models.dpr.configuration_dpr import DPRConfig +from transformers.models.dpr.tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer +from transformers.models.rag.configuration_rag import RagConfig +from transformers.models.rag.retrieval_rag import CustomHFIndex, RagRetriever +from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES +from transformers.testing_utils import ( + require_datasets, + require_faiss, + require_sentencepiece, + require_tokenizers, + require_torch, +) + + +if is_faiss_available(): + import faiss + + +@require_faiss +@require_datasets +class RagRetrieverTest(TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + self.retrieval_vector_size = 8 + + # DPR tok + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") + os.makedirs(dpr_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + # BART tok + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") + os.makedirs(bart_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: + return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) + + def get_dpr_ctx_encoder_tokenizer(self) -> DPRContextEncoderTokenizer: + return DPRContextEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) + + def get_bart_tokenizer(self) -> BartTokenizer: + return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_dummy_dataset(self): + dataset = Dataset.from_dict( + { + "id": ["0", "1"], + "text": ["foo", "bar"], + "title": ["Foo", "Bar"], + "embeddings": [np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size)], + } + ) + dataset.add_faiss_index("embeddings", string_factory="Flat", 
metric_type=faiss.METRIC_INNER_PRODUCT) + return dataset + + def get_dummy_canonical_hf_index_retriever(self): + dataset = self.get_dummy_dataset() + config = RagConfig( + retrieval_vector_size=self.retrieval_vector_size, + question_encoder=DPRConfig().to_dict(), + generator=BartConfig().to_dict(), + ) + with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: + mock_load_dataset.return_value = dataset + retriever = RagRetriever( + config, + question_encoder_tokenizer=self.get_dpr_tokenizer(), + generator_tokenizer=self.get_bart_tokenizer(), + ) + return retriever + + def get_dummy_custom_hf_index_retriever(self, from_disk: bool): + dataset = self.get_dummy_dataset() + config = RagConfig( + retrieval_vector_size=self.retrieval_vector_size, + question_encoder=DPRConfig().to_dict(), + generator=BartConfig().to_dict(), + index_name="custom", + ) + if from_disk: + config.passages_path = os.path.join(self.tmpdirname, "dataset") + config.index_path = os.path.join(self.tmpdirname, "index.faiss") + dataset.get_index("embeddings").save(os.path.join(self.tmpdirname, "index.faiss")) + dataset.drop_index("embeddings") + dataset.save_to_disk(os.path.join(self.tmpdirname, "dataset")) + del dataset + retriever = RagRetriever( + config, + question_encoder_tokenizer=self.get_dpr_tokenizer(), + generator_tokenizer=self.get_bart_tokenizer(), + ) + else: + retriever = RagRetriever( + config, + question_encoder_tokenizer=self.get_dpr_tokenizer(), + generator_tokenizer=self.get_bart_tokenizer(), + index=CustomHFIndex(config.retrieval_vector_size, dataset), + ) + return retriever + + def get_dummy_legacy_index_retriever(self): + dataset = Dataset.from_dict( + { + "id": ["0", "1"], + "text": ["foo", "bar"], + "title": ["Foo", "Bar"], + "embeddings": [np.ones(self.retrieval_vector_size + 1), 2 * np.ones(self.retrieval_vector_size + 1)], + } + ) + dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) + + index_file_name = os.path.join(self.tmpdirname, "hf_bert_base.hnswSQ8_correct_phi_128.c_index") + dataset.save_faiss_index("embeddings", index_file_name + ".index.dpr") + pickle.dump(dataset["id"], open(index_file_name + ".index_meta.dpr", "wb")) + + passages_file_name = os.path.join(self.tmpdirname, "psgs_w100.tsv.pkl") + passages = {sample["id"]: [sample["text"], sample["title"]] for sample in dataset} + pickle.dump(passages, open(passages_file_name, "wb")) + + config = RagConfig( + retrieval_vector_size=self.retrieval_vector_size, + question_encoder=DPRConfig().to_dict(), + generator=BartConfig().to_dict(), + index_name="legacy", + index_path=self.tmpdirname, + ) + retriever = RagRetriever( + config, question_encoder_tokenizer=self.get_dpr_tokenizer(), generator_tokenizer=self.get_bart_tokenizer() + ) + return retriever + + def test_canonical_hf_index_retriever_retrieve(self): + n_docs = 1 + retriever = self.get_dummy_canonical_hf_index_retriever() + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertEqual(len(doc_dicts), 2) + self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) + self.assertEqual(len(doc_dicts[0]["id"]), n_docs) + self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc + 
self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc + self.assertListEqual(doc_ids.tolist(), [[1], [0]]) + + def test_canonical_hf_index_retriever_save_and_from_pretrained(self): + retriever = self.get_dummy_canonical_hf_index_retriever() + with tempfile.TemporaryDirectory() as tmp_dirname: + with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: + mock_load_dataset.return_value = self.get_dummy_dataset() + retriever.save_pretrained(tmp_dirname) + retriever = RagRetriever.from_pretrained(tmp_dirname) + self.assertIsInstance(retriever, RagRetriever) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = retriever.retrieve(hidden_states, n_docs=1) + self.assertTrue(out is not None) + + def test_custom_hf_index_retriever_retrieve(self): + n_docs = 1 + retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertEqual(len(doc_dicts), 2) + self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) + self.assertEqual(len(doc_dicts[0]["id"]), n_docs) + self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc + self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc + self.assertListEqual(doc_ids.tolist(), [[1], [0]]) + + def test_custom_hf_index_retriever_save_and_from_pretrained(self): + retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) + with tempfile.TemporaryDirectory() as tmp_dirname: + retriever.save_pretrained(tmp_dirname) + retriever = RagRetriever.from_pretrained(tmp_dirname) + self.assertIsInstance(retriever, RagRetriever) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = retriever.retrieve(hidden_states, n_docs=1) + self.assertTrue(out is not None) + + def test_custom_hf_index_retriever_retrieve_from_disk(self): + n_docs = 1 + retriever = self.get_dummy_custom_hf_index_retriever(from_disk=True) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertEqual(len(doc_dicts), 2) + self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) + self.assertEqual(len(doc_dicts[0]["id"]), n_docs) + self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc + self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc + self.assertListEqual(doc_ids.tolist(), [[1], [0]]) + + def test_custom_hf_index_retriever_save_and_from_pretrained_from_disk(self): + retriever = self.get_dummy_custom_hf_index_retriever(from_disk=True) + with tempfile.TemporaryDirectory() as tmp_dirname: + retriever.save_pretrained(tmp_dirname) + retriever = RagRetriever.from_pretrained(tmp_dirname) + self.assertIsInstance(retriever, RagRetriever) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), 
-np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = retriever.retrieve(hidden_states, n_docs=1) + self.assertTrue(out is not None) + + def test_legacy_index_retriever_retrieve(self): + n_docs = 1 + retriever = self.get_dummy_legacy_index_retriever() + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertEqual(len(doc_dicts), 2) + self.assertEqual(sorted(doc_dicts[0]), ["text", "title"]) + self.assertEqual(len(doc_dicts[0]["text"]), n_docs) + self.assertEqual(doc_dicts[0]["text"][0], "bar") # max inner product is reached with second doc + self.assertEqual(doc_dicts[1]["text"][0], "foo") # max inner product is reached with first doc + self.assertListEqual(doc_ids.tolist(), [[1], [0]]) + + def test_legacy_hf_index_retriever_save_and_from_pretrained(self): + retriever = self.get_dummy_legacy_index_retriever() + with tempfile.TemporaryDirectory() as tmp_dirname: + retriever.save_pretrained(tmp_dirname) + retriever = RagRetriever.from_pretrained(tmp_dirname) + self.assertIsInstance(retriever, RagRetriever) + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = retriever.retrieve(hidden_states, n_docs=1) + self.assertTrue(out is not None) + + @require_torch + @require_tokenizers + @require_sentencepiece + def test_hf_index_retriever_call(self): + import torch + + n_docs = 1 + retriever = self.get_dummy_canonical_hf_index_retriever() + question_input_ids = [[5, 7], [10, 11]] + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = retriever(question_input_ids, hidden_states, prefix=retriever.config.generator.prefix, n_docs=n_docs) + context_input_ids, context_attention_mask, retrieved_doc_embeds = ( + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + ) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertIsInstance(context_input_ids, list) + self.assertIsInstance(context_attention_mask, list) + self.assertIsInstance(retrieved_doc_embeds, np.ndarray) + + out = retriever( + question_input_ids, + hidden_states, + prefix=retriever.config.generator.prefix, + n_docs=n_docs, + return_tensors="pt", + ) + context_input_ids, context_attention_mask, retrieved_doc_embeds, doc_ids = ( # noqa: F841 + out["context_input_ids"], + out["context_attention_mask"], + out["retrieved_doc_embeds"], + out["doc_ids"], + ) + self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) + self.assertIsInstance(context_input_ids, torch.Tensor) + self.assertIsInstance(context_attention_mask, torch.Tensor) + self.assertIsInstance(retrieved_doc_embeds, torch.Tensor) + + @require_torch + @require_tokenizers + @require_sentencepiece + def test_custom_hf_index_end2end_retriever_call(self): + + context_encoder_tokenizer = self.get_dpr_ctx_encoder_tokenizer() + n_docs = 1 + retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) + retriever.set_ctx_encoder_tokenizer(context_encoder_tokenizer) + + question_input_ids = [[5, 7], [10, 11]] + hidden_states = np.array( + [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 + ) + out = 
retriever(question_input_ids, hidden_states, prefix=retriever.config.generator.prefix, n_docs=n_docs) + + self.assertEqual( + len(out), 6 + ) # check whether the retriever output consists of 6 attributes including tokenized docs + self.assertEqual( + all(k in out for k in ("tokenized_doc_ids", "tokenized_doc_attention_mask")), True + ) # check for doc-token-related keys in the dictionary diff --git a/test_sequence_feature_extraction_common.py b/test_sequence_feature_extraction_common.py new file mode 100644 index 0000000000000000000000000000000000000000..f375e10e19fb64b90dd536d60d64ca41ff81b4da --- /dev/null +++ b/test_sequence_feature_extraction_common.py @@ -0,0 +1,253 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from transformers import BatchFeature +from transformers.testing_utils import require_tf, require_torch + +from .test_feature_extraction_common import FeatureExtractionSavingTestMixin + + +class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin): + + # to overwrite in feature-extractor-specific tests + feat_extract_tester = None + feature_extraction_class = None + + @property + def feat_extract_dict(self): + return self.feat_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_common_properties(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feat_extract, "feature_size")) + self.assertTrue(hasattr(feat_extract, "sampling_rate")) + self.assertTrue(hasattr(feat_extract, "padding_value")) + + def test_batch_feature(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name]))) + + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + @require_torch + def test_batch_feature_pt(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape 
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + @require_tf + def test_batch_feature_tf(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + def _check_padding(self, numpify=False): + def _inputs_have_equal_length(input): + length = len(input[0]) + for input_slice in input[1:]: + if len(input_slice) != length: + return False + return True + + def _inputs_are_equal(input_1, input_2): + if len(input_1) != len(input_2): + return False + + for input_slice_1, input_slice_2 in zip(input_1, input_2): + if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3): + return False + return True + + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + pad_diff = self.feat_extract_tester.seq_length_diff + pad_max_length = self.feat_extract_tester.max_seq_length + pad_diff + pad_min_length = self.feat_extract_tester.min_seq_length + batch_size = self.feat_extract_tester.batch_size + feature_size = self.feat_extract_tester.feature_size + + # test padding for List[int] + numpy + input_1 = feat_extract.pad(processed_features, padding=False)[input_name] + input_2 = feat_extract.pad(processed_features, padding="longest")[input_name] + input_3 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[-1]))[ + input_name + ] + input_4 = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + + # max_length parameter has to be provided when setting `padding="max_length"` + with self.assertRaises(ValueError): + feat_extract.pad(processed_features, padding="max_length")[input_name] + + input_5 = feat_extract.pad( + processed_features, padding="max_length", max_length=pad_max_length, return_tensors="np" + )[input_name] + + self.assertFalse(_inputs_have_equal_length(input_1)) + self.assertTrue(_inputs_have_equal_length(input_2)) + self.assertTrue(_inputs_have_equal_length(input_3)) + self.assertTrue(_inputs_are_equal(input_2, input_3)) + self.assertTrue(len(input_1[0]) == pad_min_length) + self.assertTrue(len(input_1[1]) == pad_min_length + pad_diff) + self.assertTrue(input_4.shape[:2] == (batch_size, len(input_3[0]))) + self.assertTrue(input_5.shape[:2] == (batch_size, pad_max_length)) + + if feature_size > 1: + self.assertTrue(input_4.shape[2] == input_5.shape[2] == feature_size) + + # test padding for `pad_to_multiple_of` for List[int] + numpy + input_6 = feat_extract.pad(processed_features, pad_to_multiple_of=10)[input_name] + input_7 = feat_extract.pad(processed_features, padding="longest", pad_to_multiple_of=10)[input_name] + input_8 = feat_extract.pad( + processed_features, padding="max_length", pad_to_multiple_of=10, max_length=pad_max_length + )[input_name] + input_9 = 
feat_extract.pad( + processed_features, + padding="max_length", + pad_to_multiple_of=10, + max_length=pad_max_length, + return_tensors="np", + )[input_name] + + self.assertTrue(all(len(x) % 10 == 0 for x in input_6)) + self.assertTrue(_inputs_are_equal(input_6, input_7)) + + expected_mult_pad_length = pad_max_length if pad_max_length % 10 == 0 else (pad_max_length // 10 + 1) * 10 + self.assertTrue(all(len(x) == expected_mult_pad_length for x in input_8)) + self.assertEqual(input_9.shape[:2], (batch_size, expected_mult_pad_length)) + + if feature_size > 1: + self.assertTrue(input_9.shape[2] == feature_size) + + # Check padding value is correct + padding_vector_sum = (np.ones(self.feat_extract_tester.feature_size) * feat_extract.padding_value).sum() + self.assertTrue( + abs(np.asarray(input_2[0])[pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) + < 1e-3 + ) + self.assertTrue( + abs( + np.asarray(input_2[1])[pad_min_length + pad_diff :].sum() + - padding_vector_sum * (pad_max_length - pad_min_length - pad_diff) + ) + < 1e-3 + ) + self.assertTrue( + abs( + np.asarray(input_2[2])[pad_min_length + 2 * pad_diff :].sum() + - padding_vector_sum * (pad_max_length - pad_min_length - 2 * pad_diff) + ) + < 1e-3 + ) + self.assertTrue( + abs(input_5[0, pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) < 1e-3 + ) + self.assertTrue( + abs(input_9[0, pad_min_length:].sum() - padding_vector_sum * (expected_mult_pad_length - pad_min_length)) + < 1e-3 + ) + + def test_padding_from_list(self): + self._check_padding(numpify=False) + + def test_padding_from_array(self): + self._check_padding(numpify=True) + + @require_torch + def test_padding_accepts_tensors_pt(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name] + + self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().sum()) < 1e-2) + + @require_tf + def test_padding_accepts_tensors_tf(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name] + + self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().sum()) < 1e-2) + + def test_attention_mask(self): + feat_dict = self.feat_extract_dict + feat_dict["return_attention_mask"] = True + feat_extract = self.feature_extraction_class(**feat_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_lengths = [len(x) for x in speech_inputs] + input_name = feat_extract.model_input_names[0] + + processed = BatchFeature({input_name: speech_inputs}) + + processed = feat_extract.pad(processed, padding="longest", return_tensors="np") + self.assertIn("attention_mask", processed) + self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) + 
self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths) diff --git a/test_skip_decorators.py b/test_skip_decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..89ff0e3bafdc2b5747a526b4c51207621b2710bf --- /dev/null +++ b/test_skip_decorators.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +# this test validates that we can stack skip decorators in groups and that +# they work correctly with other decorators +# +# since the decorators have already built their decision params (like checking +# env[]), we can't mock the env and test each of the combinations, so ideally +# the following 4 should be run. But since we have different CI jobs running +# different configs, all combinations should get covered +# +# RUN_SLOW=1 pytest -rA tests/test_skip_decorators.py +# RUN_SLOW=1 CUDA_VISIBLE_DEVICES="" pytest -rA tests/test_skip_decorators.py +# RUN_SLOW=0 pytest -rA tests/test_skip_decorators.py +# RUN_SLOW=0 CUDA_VISIBLE_DEVICES="" pytest -rA tests/test_skip_decorators.py + +import os +import unittest + +import pytest + +from parameterized import parameterized +from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device + + +# skipping in unittest tests + +params = [(1,)] + + +# test that we can stack our skip decorators with 3rd party decorators +def check_slow(): + run_slow = bool(os.getenv("RUN_SLOW", 0)) + if run_slow: + assert True + else: + assert False, "should have been skipped" + + +# test that we can stack our skip decorators +def check_slow_torch_cuda(): + run_slow = bool(os.getenv("RUN_SLOW", 0)) + if run_slow and torch_device == "cuda": + assert True + else: + assert False, "should have been skipped" + + +@require_torch +class SkipTester(unittest.TestCase): + @slow + @require_torch_gpu + def test_2_skips_slow_first(self): + check_slow_torch_cuda() + + @require_torch_gpu + @slow + def test_2_skips_slow_last(self): + check_slow_torch_cuda() + + # The combination of any skip decorator followed by `parameterized` fails to skip the tests + # 1. @slow manages to correctly skip `test_param_slow_first` + # 2. but then `parameterized` creates new tests, with a unique name for each parameter group. + # It has no idea that they are to be skipped and so they all run, ignoring @slow + # Therefore skip decorators must come after `parameterized` + # + # @slow + # @parameterized.expand(params) + # def test_param_slow_first(self, param=None): + # check_slow() + + # This works as expected: + # 1. `parameterized` creates new tests with unique names + # 2. 
each of them gets an opportunity to be skipped + @parameterized.expand(params) + @slow + def test_param_slow_last(self, param=None): + check_slow() + + +# skipping in non-unittest tests +# no problem at all here + + +@slow +@require_torch_gpu +def test_pytest_2_skips_slow_first(): + check_slow_torch_cuda() + + +@require_torch_gpu +@slow +def test_pytest_2_skips_slow_last(): + check_slow_torch_cuda() + + +@slow +@pytest.mark.parametrize("param", [1]) +def test_pytest_param_slow_first(param): + check_slow() + + +@pytest.mark.parametrize("param", [1]) +@slow +def test_pytest_param_slow_last(param): + check_slow() diff --git a/test_tokenization_albert.py b/test_tokenization_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..e965f52de2aa81b2173baa74de9b40921d0ce1c2 --- /dev/null +++ b/test_tokenization_albert.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2019 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers import AlbertTokenizer, AlbertTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") + + +@require_sentencepiece +@require_tokenizers +class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = AlbertTokenizer + rust_tokenizer_class = AlbertTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + test_sentencepiece_ignore_case = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = AlbertTokenizer(SAMPLE_VOCAB) + tokenizer.save_pretrained(self.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "this is a test" + output_text = "this is a test" + return input_text, output_text + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<pad>" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<pad>") + self.assertEqual(vocab_keys[1], "<unk>") + self.assertEqual(vocab_keys[-1], "▁eloquent") + self.assertEqual(len(vocab_keys), 30_000) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 30_000) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé."
+ + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_full_tokenizer(self): + tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"]) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."] + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."], + ) + + def test_sequence_builders(self): + tokenizer = AlbertTokenizer(SAMPLE_VOCAB) + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'input_ids': [[2, 21970, 13, 5, 6092, 167, 28, 7103, 2153, 673, 8, 7028, 12051, 18, 17, 7103, 2153, 673, 8, 3515, 18684, 8, 4461, 6, 1927, 297, 8, 12060, 2607, 18, 13, 5, 4461, 15, 10538, 38, 8, 135, 15, 822, 58, 15, 993, 10363, 15, 1460, 8005, 4461, 15, 993, 255, 2328, 9, 9, 9, 6, 26, 1112, 816, 3260, 13, 5, 103, 2377, 6, 17, 1112, 816, 2782, 13, 5, 103, 10641, 6, 29, 84, 2512, 2430, 782, 18684, 2761, 19, 808, 2430, 2556, 17, 855, 1480, 9477, 4091, 128, 11712, 15, 7103, 2153, 673, 17, 24883, 9990, 9, 3], [2, 11502, 25, 1006, 20, 782, 8, 11809, 855, 1732, 19393, 18667, 37, 367, 21018, 69, 1854, 34, 11860, 19124, 27, 156, 225, 17, 193, 4141, 19, 65, 9124, 9, 3, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="albert-base-v2", + revision="6b6560eaf5ff2e250b00c50f380c5389a9c2d82e", + ) diff --git a/test_tokenization_auto.py b/test_tokenization_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..72db79d1c52d0d8f68710d4f60f132b7d636dcae --- /dev/null +++ b/test_tokenization_auto.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
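For reference, the behaviour `test_full_tokenizer` checks above (lower-casing, `▁` word-boundary markers, out-of-vocab pieces mapping to the unknown token) can also be observed against a released checkpoint; `albert-base-v2` is the same model name the integration test pins. A hedged sketch, with the example pieces shown as an expectation rather than a guarantee:

```python
from transformers import AlbertTokenizer

# Illustrative only; uses the released checkpoint rather than the fixture vocab.
tok = AlbertTokenizer.from_pretrained("albert-base-v2")

pieces = tok.tokenize("This is a test")   # expected: ['▁this', '▁is', '▁a', '▁test']
ids = tok.convert_tokens_to_ids(pieces)
assert tok.convert_ids_to_tokens(ids) == pieces  # round trip holds for in-vocab pieces
```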
+ + +import unittest + +from transformers import ( + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + AutoTokenizer, + BertTokenizer, + BertTokenizerFast, + GPT2Tokenizer, + GPT2TokenizerFast, + PreTrainedTokenizerFast, + RobertaTokenizer, + RobertaTokenizerFast, +) +from transformers.models.auto.configuration_auto import AutoConfig +from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.testing_utils import ( + DUMMY_DIFF_TOKENIZER_IDENTIFIER, + DUMMY_UNKWOWN_IDENTIFIER, + SMALL_MODEL_IDENTIFIER, + require_tokenizers, + slow, +) + + +class AutoTokenizerTest(unittest.TestCase): + @slow + def test_tokenizer_from_pretrained(self): + for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x): + tokenizer = AutoTokenizer.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast)) + self.assertGreater(len(tokenizer), 0) + + for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys(): + tokenizer = AutoTokenizer.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast)) + self.assertGreater(len(tokenizer), 0) + + def test_tokenizer_from_pretrained_identifier(self): + tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER) + self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast)) + self.assertEqual(tokenizer.vocab_size, 12) + + def test_tokenizer_from_model_type(self): + tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER) + self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast)) + self.assertEqual(tokenizer.vocab_size, 20) + + def test_tokenizer_from_tokenizer_class(self): + config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER) + self.assertIsInstance(config, RobertaConfig) + # Check that tokenizer_type ≠ model_type + tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config) + self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast)) + self.assertEqual(tokenizer.vocab_size, 12) + + @require_tokenizers + def test_tokenizer_identifier_with_correct_config(self): + for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]: + tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased") + self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast)) + + if isinstance(tokenizer, BertTokenizer): + self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False) + else: + self.assertEqual(tokenizer.do_lower_case, False) + + self.assertEqual(tokenizer.model_max_length, 512) + + @require_tokenizers + def test_tokenizer_identifier_non_existent(self): + for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]: + with self.assertRaises(EnvironmentError): + _ = tokenizer_class.from_pretrained("julien-c/herlolip-not-exists") + + def test_parents_and_children_in_mappings(self): + # Test that the children are placed before the parents in the mappings, as the `isinstance` check will be + # triggered by the parents and will return the wrong configuration type when using auto models + + mappings = (TOKENIZER_MAPPING,) + + for mapping in mappings: + mapping = tuple(mapping.items()) + for index, (child_config, _) in enumerate(mapping[1:]): + for parent_config, _ in mapping[: index + 1]: + with self.subTest(msg=f"Testing if
{child_config.__name__} is child of {parent_config.__name__}"): + self.assertFalse(issubclass(child_config, parent_config)) + + @require_tokenizers + def test_from_pretrained_use_fast_toggle(self): + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer) + self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast) + + @require_tokenizers + def test_do_lower_case(self): + tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=False) + sample = "Hello, world. How are you?" + tokens = tokenizer.tokenize(sample) + self.assertEqual("[UNK]", tokens[0]) + + tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base", do_lower_case=False) + tokens = tokenizer.tokenize(sample) + self.assertEqual("[UNK]", tokens[0]) + + @require_tokenizers + def test_PreTrainedTokenizerFast_from_pretrained(self): + tokenizer = AutoTokenizer.from_pretrained("robot-test/dummy-tokenizer-fast-with-model-config") + self.assertEqual(type(tokenizer), PreTrainedTokenizerFast) + self.assertEqual(tokenizer.model_max_length, 512) + self.assertEqual(tokenizer.vocab_size, 30000) + self.assertEqual(tokenizer.unk_token, "[UNK]") + self.assertEqual(tokenizer.padding_side, "right") diff --git a/test_tokenization_bart.py b/test_tokenization_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..2a289572688f49d63cf3d0735bff9dd2aa58b05d --- /dev/null +++ b/test_tokenization_bart.py @@ -0,0 +1,185 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
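# Illustrative sketch (hypothetical stand-ins, not real transformers classes): why
# test_parents_and_children_in_mappings above insists that child configs precede their
# parents. Auto-class lookup walks the mapping in order and returns on the first
# isinstance match, so a parent entry placed first would shadow all of its children.
class ParentConfig:
    pass


class ChildConfig(ParentConfig):
    pass


ORDERED_MAPPING = [
    (ChildConfig, "ChildTokenizer"),    # child first: isinstance(ChildConfig(), ParentConfig) is also True,
    (ParentConfig, "ParentTokenizer"),  # so the parent entry would win if it came first
]


def resolve(config):
    for config_cls, tokenizer_name in ORDERED_MAPPING:
        if isinstance(config, config_cls):
            return tokenizer_name
    raise KeyError(type(config))


assert resolve(ChildConfig()) == "ChildTokenizer"
assert resolve(ParentConfig()) == "ParentTokenizer"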
+import json +import os +import unittest + +from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding +from transformers.file_utils import cached_property +from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers, require_torch + +from .test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors + + +@require_tokenizers +class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = BartTokenizer + rust_tokenizer_class = BartTokenizerFast + test_rust_tokenizer = True + from_pretrained_filter = filter_roberta_detectors + # from_pretrained_kwargs = {'add_prefix_space': True} + + def setUp(self): + super().setUp() + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + return "lower newer", "lower newer" + + @cached_property + def default_tokenizer(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + @cached_property + def default_tokenizer_fast(self): + return BartTokenizerFast.from_pretrained("facebook/bart-large") + + @require_torch + def test_prepare_batch(self): + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + expected_src_tokens = [0, 250, 251, 17818, 13, 39186, 1938, 4, 2] + + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer(src_text, max_length=len(expected_src_tokens), padding=True, return_tensors="pt") + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 9), batch.input_ids.shape) + self.assertEqual((2, 9), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(expected_src_tokens, result) + # Test that special tokens are reset + + @require_torch + def test_prepare_batch_empty_target_text(self): + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer(src_text, padding=True, return_tensors="pt") + # check if input_ids are returned and no labels + self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertNotIn("labels", batch) + self.assertNotIn("decoder_attention_mask", batch) + + @require_torch + def test_as_target_tokenizer_target_length(self): + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + for tokenizer in [self.default_tokenizer,
self.default_tokenizer_fast]: + with tokenizer.as_target_tokenizer(): + targets = tokenizer(tgt_text, max_length=32, padding="max_length", return_tensors="pt") + self.assertEqual(32, targets["input_ids"].shape[1]) + + @require_torch + def test_prepare_batch_not_longer_than_maxlen(self): + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer( + ["I am a small frog" * 1024, "I am a small frog"], padding=True, truncation=True, return_tensors="pt" + ) + self.assertIsInstance(batch, BatchEncoding) + self.assertEqual(batch.input_ids.shape, (2, 1024)) + + @require_torch + def test_special_tokens(self): + + src_text = ["A long paragraph for summarization."] + tgt_text = [ + "Summary of the text.", + ] + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + inputs = tokenizer(src_text, return_tensors="pt") + with tokenizer.as_target_tokenizer(): + targets = tokenizer(tgt_text, return_tensors="pt") + input_ids = inputs["input_ids"] + labels = targets["input_ids"] + self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) + self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item()) + + def test_pretokenized_inputs(self): + pass + + def test_embeded_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + sentence = "A, <mask> AllenNLP sentence." + tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + + # token_type_ids should put 0 everywhere + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + # attention_mask should put 1 everywhere, so sum over length should be 1 + self.assertEqual( + sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), + sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), + ) + + tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + + # Rust correctly handles the space before the mask while Python doesn't + self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + + self.assertSequenceEqual( + tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] + ) + self.assertSequenceEqual( + tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] + ) diff --git a/test_tokenization_barthez.py b/test_tokenization_barthez.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ba5b1582361c75a0b07f4c10075fdcaaa4eb38 --- /dev/null +++ b/test_tokenization_barthez.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2020 Ecole Polytechnique and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +@require_sentencepiece +@slow # see https://github.com/huggingface/transformers/issues/11457 +class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BarthezTokenizer + rust_tokenizer_class = BarthezTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez") + tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(self.tmpdirname, legacy_format=False) + self.tokenizer = tokenizer + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<pad>" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<s>") + self.assertEqual(vocab_keys[1], "<pad>") + self.assertEqual(vocab_keys[-1], "<mask>") + self.assertEqual(len(vocab_keys), 101_122) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 101_122) + + @require_torch + def test_prepare_batch(self): + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + expected_src_tokens = [0, 57, 3018, 70307, 91, 2] + + batch = self.tokenizer( + src_text, max_length=len(expected_src_tokens), padding=True, truncation=True, return_tensors="pt" + ) + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 6), batch.input_ids.shape) + self.assertEqual((2, 6), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(expected_src_tokens, result) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé."
+ + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[0, 490, 14328, 4507, 354, 47, 43669, 95, 25, 78117, 20215, 19779, 190, 22, 400, 4, 35343, 80310, 603, 86, 24937, 105, 33438, 94762, 196, 39642, 7, 15, 15933, 173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10534, 87, 25, 66, 3358, 196, 55289, 8, 82961, 81, 2204, 75203, 7, 15, 763, 12956, 216, 178, 14328, 9595, 1377, 69693, 7, 448, 71021, 196, 18106, 1437, 13974, 108, 9083, 4, 49315, 7, 39, 86, 1326, 2793, 46333, 4, 448, 196, 74588, 7, 49315, 7, 39, 21, 822, 38470, 74, 21, 66723, 62480, 8, 22050, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + # moussaKam/mbarthez is a French model, so we also use French texts. + sequences = [ + "Le transformeur est un modèle d'apprentissage profond introduit en 2017, " + "utilisé principalement dans le domaine du traitement automatique des langues (TAL).", + "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus " + "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches " + "telles que la traduction et la synthèse de texte.", + ] + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="moussaKam/mbarthez", + revision="c2e4ecbca5e3cd2c37fe1ac285ca4fbdf1366fb6", + sequences=sequences, + ) diff --git a/test_tokenization_bert.py b/test_tokenization_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..3b8dced0ab4a98af38fba75b9ad8eab1b49010d5 --- /dev/null +++ b/test_tokenization_bert.py @@ -0,0 +1,301 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
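# Illustrative sketch (example checkpoint only, not from the diff above): the slow/fast
# parity pattern used in the Barthez test above, and again in the BERT test below, reduces
# to one invariant: tokenize() and encode() of the Python and Rust tokenizers must agree
# exactly, with and without special tokens. Any model shipping both implementations works.
from transformers import AutoTokenizer

slow_tok = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
fast_tok = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

text = "I was born in 92000, and this is falsé."
assert slow_tok.tokenize(text) == fast_tok.tokenize(text)
assert slow_tok.encode(text, add_special_tokens=False) == fast_tok.encode(text, add_special_tokens=False)
assert slow_tok.encode(text) == fast_tok.encode(text)  # with special tokens too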
+ + +import os +import unittest + +from transformers import BertTokenizerFast +from transformers.models.bert.tokenization_bert import ( + VOCAB_FILES_NAMES, + BasicTokenizer, + BertTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.testing_utils import require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin, filter_non_english + + +@require_tokenizers +class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BertTokenizer + rust_tokenizer_class = BertTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # With lower casing + tokenizer = self.get_tokenizer(do_lower_case=True) + rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
"), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) + + self.assertListEqual( + [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]] + ) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) diff --git a/test_tokenization_bert_generation.py b/test_tokenization_bert_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..40d3f1bae84e91307f45cf5e9b1e1194f258507a --- /dev/null +++ b/test_tokenization_bert_generation.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
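# Illustrative sketch (assumes network access to the example checkpoint; not from the diff
# above): the offsets test above exercises a fast-tokenizer-only feature. With
# return_offsets_mapping=True, each token carries a (start, end) character span into the
# original string, and special tokens such as [CLS]/[SEP] are reported as the empty span (0, 0).
from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained("bert-base-cased")
sentence = "A, naïve sentence."
enc = tok.encode_plus(sentence, return_offsets_mapping=True, add_special_tokens=True)

assert enc["offset_mapping"][0] == (0, 0)   # [CLS]
assert enc["offset_mapping"][-1] == (0, 0)  # [SEP]
# every other entry indexes back into the input string
for start, end in enc["offset_mapping"][1:-1]:
    assert 0 <= start <= end <= len(sentence)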
+ +import os +import unittest + +from transformers import BertGenerationTokenizer +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SPIECE_UNDERLINE = "▁" + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BertGenerationTokenizer + test_rust_tokenizer = False + test_sentencepiece = True + + def setUp(self): + super().setUp() + + tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<s>" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<unk>") + self.assertEqual(vocab_keys[1], "<s>") + self.assertEqual(vocab_keys[-1], "<pad>") + self.assertEqual(len(vocab_keys), 1_002) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_000) + + def test_full_tokenizer(self): + tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "<unk>", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "<unk>", + ".", + ], + ) + + @cached_property + def big_tokenizer(self): + return BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + + @slow + def test_tokenization_base_easy_symbols(self): + symbols = "Hello World!" + original_tokenizer_encodings = [18536, 2260, 101] + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenization_base_hard_symbols(self): + symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - .
Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth' + original_tokenizer_encodings = [ + 871, + 419, + 358, + 946, + 991, + 2521, + 452, + 358, + 1357, + 387, + 7751, + 3536, + 112, + 985, + 456, + 126, + 865, + 938, + 5400, + 5734, + 458, + 1368, + 467, + 786, + 2462, + 5246, + 1159, + 633, + 865, + 4519, + 457, + 582, + 852, + 2557, + 427, + 916, + 508, + 405, + 34324, + 497, + 391, + 408, + 11342, + 1244, + 385, + 100, + 938, + 985, + 456, + 574, + 362, + 12597, + 3200, + 3129, + 1172, + ] + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import BertGenerationConfig, BertGenerationEncoder + + # Build sequence + first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False) + batch_encoded_sequence = self.big_tokenizer.batch_encode_plus( + [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False + ) + + config = BertGenerationConfig() + model = BertGenerationEncoder(config) + + assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size + + with torch.no_grad(): + model(**encoded_sequence) + model(**batch_encoded_sequence) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114], [448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="google/bert_for_seq_generation_L-24_bbc_encoder", + revision="c817d1fd1be2ffa69431227a1fe320544943d4db", + ) diff --git a/test_tokenization_bert_japanese.py b/test_tokenization_bert_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..b42a14314a4ea2a3aca4f282d5c7d59c87654a58 --- /dev/null +++ b/test_tokenization_bert_japanese.py @@ -0,0 +1,280 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import pickle +import unittest + +from transformers import AutoTokenizer +from transformers.models.bert_japanese.tokenization_bert_japanese import ( + VOCAB_FILES_NAMES, + BertJapaneseTokenizer, + CharacterTokenizer, + MecabTokenizer, + WordpieceTokenizer, +) +from transformers.testing_utils import custom_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@custom_tokenizers +class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BertJapaneseTokenizer + test_rust_tokenizer = False + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "こんにちは", + "こん", + "にちは", + "ばんは", + "##こん", + "##にちは", + "##ばんは", + "世界", + "##世界", + "、", + "##、", + "。", + "##。", + ] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。" + return input_text, output_text + + def get_clean_sequence(self, tokenizer): + input_text, output_text = self.get_input_output_texts(tokenizer) + ids = tokenizer.encode(output_text, add_special_tokens=False) + text = tokenizer.decode(ids, clean_up_tokenization_spaces=False) + return text, ids + + def test_pretokenized_inputs(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_pair_input(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_single_input(self): + pass # TODO add if relevant + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。") + self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) + + def test_pickle_mecab_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab") + self.assertIsNotNone(tokenizer) + + text = "こんにちは、世界。\nこんばんは、世界。" + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", 
"こん", "##ばんは", "、", "世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) + + filename = os.path.join(self.tmpdirname, "tokenizer.bin") + with open(filename, "wb") as handle: + pickle.dump(tokenizer, handle) + + with open(filename, "rb") as handle: + tokenizer_new = pickle.load(handle) + + tokens_loaded = tokenizer_new.tokenize(text) + + self.assertListEqual(tokens, tokens_loaded) + + def test_mecab_tokenizer_ipadic(self): + tokenizer = MecabTokenizer(mecab_dic="ipadic") + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) + + def test_mecab_tokenizer_unidic_lite(self): + try: + tokenizer = MecabTokenizer(mecab_dic="unidic_lite") + except ModuleNotFoundError: + return + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) + + def test_mecab_tokenizer_unidic(self): + try: + tokenizer = MecabTokenizer(mecab_dic="unidic") + except ModuleNotFoundError: + return + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) + + def test_mecab_tokenizer_lower(self): + tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic") + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) + + def test_mecab_tokenizer_with_option(self): + try: + tokenizer = MecabTokenizer( + do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic" + ) + except RuntimeError: + # if dict doesn't exist in the system, previous code raises this error. 
+ return + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "\u3000", "。"], + ) + + def test_mecab_tokenizer_no_normalize(self): + tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic") + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"], + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"]) + + self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) + + self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) + + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese") + + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + # 2 is for "[CLS]", 3 is for "[SEP]" + assert encoded_sentence == [2] + text + [3] + assert encoded_pair == [2] + text + [3] + text_2 + [3] + + +@custom_tokenizers +class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BertJapaneseTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" + return input_text, output_text + + def test_pretokenized_inputs(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_pair_input(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_single_input(self): + pass # TODO add if relevant + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character") + + tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。") + self.assertListEqual( + tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"] + ) + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12] + ) + + def test_character_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"]) + +
self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"]) + + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese-char") + + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + # 2 is for "[CLS]", 3 is for "[SEP]" + assert encoded_sentence == [2] + text + [3] + assert encoded_pair == [2] + text + [3] + text_2 + [3] + + +@custom_tokenizers +class AutoTokenizerCustomTest(unittest.TestCase): + def test_tokenizer_bert_japanese(self): + EXAMPLE_BERT_JAPANESE_ID = "cl-tohoku/bert-base-japanese" + tokenizer = AutoTokenizer.from_pretrained(EXAMPLE_BERT_JAPANESE_ID) + self.assertIsInstance(tokenizer, BertJapaneseTokenizer) diff --git a/test_tokenization_bertweet.py b/test_tokenization_bertweet.py new file mode 100644 index 0000000000000000000000000000000000000000..bf7d5c779819b38045d87c90ae7a502cb6ebe483 --- /dev/null +++ b/test_tokenization_bertweet.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BertweetTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["I", "m", "V@@", "R@@", "r", "e@@"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "a m"] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + for token in vocab_tokens: + fp.write(f"{token} {vocab_tokens[token]}\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BertweetTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "I am VinAI Research" + output_text = "I <unk> m V<unk> <unk> <unk> I Re<unk> e<unk> <unk> <unk> <unk>" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = BertweetTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "I am VinAI Research" + bpe_tokens = "I a@@ m V@@ i@@ n@@ A@@ I R@@ e@@ s@@ e@@ a@@ r@@ c@@ h".split() + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [4, 3, 5, 6, 3, 3, 3, 4, 7, 9, 3, 9, 3, 3, 3, 3, 3] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/test_tokenization_big_bird.py b/test_tokenization_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..9c933ade97f1bde09014af9e391de2c80823d975 --- /dev/null +++ b/test_tokenization_big_bird.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
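# Illustrative sketch (token set and file names are assumptions following the
# GPT-2/RoBERTa convention referenced via VOCAB_FILES_NAMES above; not from the diff):
# both the BART and BERTweet setUp methods above build a fully offline toy tokenizer by
# writing a tiny vocab and merges file into a temporary directory. Standalone equivalent:
import json
import os
import tempfile

tmpdir = tempfile.mkdtemp()
vocab = {"l": 0, "o": 1, "w": 2, "e": 3, "r": 4, "er": 5, "<unk>": 6}
merges = ["#version: 0.2", "e r"]  # one BPE merge rule: "e" + "r" -> "er"

with open(os.path.join(tmpdir, "vocab.json"), "w", encoding="utf-8") as fp:
    json.dump(vocab, fp)
with open(os.path.join(tmpdir, "merges.txt"), "w", encoding="utf-8") as fp:
    fp.write("\n".join(merges))
# A BPE tokenizer class could now load this directory without any network access,
# e.g. RobertaTokenizer.from_pretrained(tmpdir, unk_token="<unk>").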
+ +import os +import unittest + +from transformers import BigBirdTokenizer, BigBirdTokenizerFast +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SPIECE_UNDERLINE = "▁" + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BigBirdTokenizer + rust_tokenizer_class = BigBirdTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<s>" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<unk>") + self.assertEqual(vocab_keys[1], "<s>") + self.assertEqual(vocab_keys[-1], "[MASK]") + self.assertEqual(len(vocab_keys), 1_004) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_000) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé." + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_full_tokenizer(self): + tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "<unk>", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE +
"f", + "al", + "s", + "", + ".", + ], + ) + + @cached_property + def big_tokenizer(self): + return BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") + + @slow + def test_tokenization_base_easy_symbols(self): + symbols = "Hello World!" + original_tokenizer_encodings = [65, 18536, 2260, 101, 66] + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenization_base_hard_symbols(self): + symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to , such as saoneuhaoesuth' + # fmt: off + original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # noqa: E231 + # fmt: on + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import BigBirdConfig, BigBirdModel + + # Build sequence + first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False) + batch_encoded_sequence = self.big_tokenizer.batch_encode_plus( + [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False + ) + + config = BigBirdConfig(attention_type="original_full") + model = BigBirdModel(config) + + assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size + + with torch.no_grad(): + model(**encoded_sequence) + model(**batch_encoded_sequence) + + @slow + def test_special_tokens(self): + """ + To reproduce: + + $ wget https://github.com/google-research/bigbird/blob/master/bigbird/vocab/gpt2.model?raw=true + $ mv gpt2.model?raw=true gpt2.model + + ``` + import tensorflow_text as tft + import tensorflow as tf + + vocab_model_file = "./gpt2.model" + tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(vocab_model_file, "rb").read())) + ids = tokenizer.tokenize("Paris is the [MASK].") + ids = tf.concat([tf.constant([65]), ids, tf.constant([66])], axis=0) + detokenized = tokenizer.detokenize(ids) # should give [CLS] Paris is the [MASK].[SEP] + """ + tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") + decoded_text = tokenizer.decode(tokenizer("Paris is the [MASK].").input_ids) + + self.assertTrue(decoded_text == "[CLS] Paris is the [MASK].[SEP]") + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[65, 39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114, 66], [65, 448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 
517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [65, 484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="google/bigbird-roberta-base", + revision="215c99f1600e06f83acce68422f2035b2b5c3510", + ) diff --git a/test_tokenization_blenderbot.py b/test_tokenization_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb4eacfb4b8bf94df2785ab564c597939a02471 --- /dev/null +++ b/test_tokenization_blenderbot.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer.""" +import unittest + +from transformers.file_utils import cached_property +from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer + + +class Blenderbot3BTokenizerTests(unittest.TestCase): + @cached_property + def tokenizer_3b(self): + return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") + + def test_encode_decode_cycle(self): + tok = self.tokenizer_3b + src_text = " I am a small frog." 
+ encoded = tok([src_text], padding=False, truncation=False)["input_ids"] + decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + assert src_text == decoded + + def test_3B_tokenization_same_as_parlai(self): + assert self.tokenizer_3b.add_prefix_space + assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]] diff --git a/test_tokenization_byt5.py b/test_tokenization_byt5.py new file mode 100644 index 0000000000000000000000000000000000000000..79c2f0005c9a4eecf00a1ad1577f44c025d72913 --- /dev/null +++ b/test_tokenization_byt5.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2020 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +from transformers import BatchEncoding, ByT5Tokenizer +from transformers.file_utils import cached_property, is_tf_available, is_torch_available + +from .test_tokenization_common import TokenizerTesterMixin + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = ByT5Tokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + tokenizer = ByT5Tokenizer() + tokenizer.save_pretrained(self.tmpdirname) + + @cached_property + def t5_base_tokenizer(self): + return ByT5Tokenizer.from_pretrained("google/byt5-small") + + def get_tokenizer(self, **kwargs) -> ByT5Tokenizer: + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def test_eos_treatment(self): + tokenizer = self.t5_base_tokenizer + batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"]) + batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""]) + self.assertListEqual(batch_with_eos_added["input_ids"], batch_without_eos_added["input_ids"]) + + def test_prepare_batch_integration(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + # fmt: off + expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 1, 0] + # fmt: on + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + self.assertIsInstance(batch, BatchEncoding) + + if FRAMEWORK != "jax": + result = list(batch.input_ids.numpy()[0]) + else: + result = list(batch.input_ids.tolist()[0]) + + self.assertListEqual(expected_src_tokens, result) + + self.assertEqual((2, 37), batch.input_ids.shape) + self.assertEqual((2, 37), batch.attention_mask.shape) + + def test_empty_target_text(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + # check if input_ids are returned and no decoder_input_ids + 
self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertNotIn("decoder_input_ids", batch) + self.assertNotIn("decoder_attention_mask", batch) + + def test_max_length_integration(self): + tokenizer = self.t5_base_tokenizer + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + with tokenizer.as_target_tokenizer(): + targets = tokenizer( + tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK + ) + self.assertEqual(32, targets["input_ids"].shape[1]) + + def test_eos_in_input(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization. "] + tgt_text = ["Summary of the text. "] + # fmt: off + expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 35, 1] + expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] + # fmt: on + + batch = tokenizer(src_text) + with tokenizer.as_target_tokenizer(): + targets = tokenizer(tgt_text) + + self.assertEqual(expected_src_tokens, batch["input_ids"][0]) + self.assertEqual(expected_tgt_tokens, targets["input_ids"][0]) + + # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + + shutil.rmtree(tmpdirname) + + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + # 
tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list + def test_pretrained_model_lists(self): + pass + + # tokenizer does not have vocabulary + def test_get_vocab(self): + pass + + # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + def test_pretokenized_inputs(self): + pass + + # tests all ids in vocab => vocab doesn't exist so unnecessary to test + def test_conversion_reversible(self): + pass diff --git a/test_tokenization_camembert.py b/test_tokenization_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..371a5cc057fbfdae480bd68e65a187e8131f16ae --- /dev/null +++ b/test_tokenization_camembert.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers import CamembertTokenizer, CamembertTokenizerFast +from transformers.file_utils import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") +SAMPLE_BPE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece_bpe.model") + +FRAMEWORK = "pt" if is_torch_available() else "tf" + + +@require_sentencepiece +@require_tokenizers +class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = CamembertTokenizer + rust_tokenizer_class = CamembertTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = CamembertTokenizer(SAMPLE_VOCAB) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<pad>" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<s>NOTUSED") + self.assertEqual(vocab_keys[1], "<pad>") + self.assertEqual(vocab_keys[-1], "<mask>") + self.assertEqual(len(vocab_keys), 1_004) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_005) + + def test_rust_and_python_bpe_tokenizers(self): + tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB) + tokenizer.save_pretrained(self.tmpdirname) + rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname) + + sequence = "I was born in 92000, and this is falsé."
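+ # This sentence mixes digits and an accented character, inputs on which the + # slow (SentencePiece) and fast (Rust) implementations diverge most easily.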
+ + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # <unk> tokens are not the same for `rust` as for `slow`, + # because spm gives back the raw token instead of `unk` in EncodeAsPieces + # tokens = tokenizer.tokenize(sequence) + tokens = tokenizer.convert_ids_to_tokens(ids) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé." + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + # CamemBERT is a French model, so we also use French texts. + sequences = [ + "Le transformeur est un modèle d'apprentissage profond introduit en 2017, " + "utilisé principalement dans le domaine du traitement automatique des langues (TAL).", + "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus " + "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches " + "telles que la traduction et la synthèse de texte.", + ] + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="camembert-base", + revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf", + sequences=sequences, + ) diff --git a/test_tokenization_clip.py b/test_tokenization_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..2f5ab7bd4a29ed6ad0211781c5925fc89d5788cf --- /dev/null +++ b/test_tokenization_clip.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = CLIPTokenizer + rust_tokenizer_class = CLIPTokenizerFast + test_rust_tokenizer = False + from_pretrained_kwargs = {"add_prefix_space": True} + test_seq2seq = False + + def setUp(self): + super().setUp() + + # fmt: off + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low", "er", "lowest", "newer", "wider", "<unk>", "<|endoftext|>"] + # fmt: on + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer " + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text, add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + + sequence = "lower newer" + + # Testing tokenization + tokens = tokenizer.tokenize(sequence, add_prefix_space=True) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + # Testing conversion to ids without special tokens + ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # Testing conversion to ids with special tokens + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + ids = tokenizer.encode(sequence, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, 
rust_ids) + + # Testing the unknown token + input_tokens = tokens + [rust_tokenizer.unk_token] + input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16] + self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_pretokenized_inputs(self, *args, **kwargs): + # It's very difficult to mix/test pretokenization with byte-level + # And get both CLIP and Roberta to work at the same time (mostly an issue of adding a space before the string) + pass + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + def test_add_tokens_tokenizer(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], 
tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + # padding is very hacky in CLIPTokenizer, pad_token_id is always 0 + # so skip this check + # self.assertEqual(tokens[-2], tokenizer.pad_token_id) diff --git a/test_tokenization_common.py b/test_tokenization_common.py new file mode 100644 index 0000000000000000000000000000000000000000..5171b88d3bd400d5e788298bf6ab5eb9ed656af0 --- /dev/null +++ b/test_tokenization_common.py @@ -0,0 +1,3197 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import inspect +import itertools +import os +import pickle +import re +import shutil +import tempfile +import unittest +from collections import OrderedDict +from itertools import takewhile +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union + +from huggingface_hub import HfApi +from requests.exceptions import HTTPError +from transformers import ( + BertTokenizer, + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, + is_tf_available, + is_torch_available, +) +from transformers.testing_utils import ( + ENDPOINT_STAGING, + PASS, + USER, + get_tests_dir, + is_pt_tf_cross_test, + is_staging_test, + require_tf, + require_tokenizers, + require_torch, + slow, +) +from transformers.tokenization_utils import AddedToken + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel + + +NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] + + +def filter_non_english(_, pretrained_name: str): + """Filter out models whose pretrained name contains a non-English language tag.""" + return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS]) + + +def filter_roberta_detectors(_, pretrained_name: str): + return "detector" not in pretrained_name + + +def merge_model_tokenizer_mappings( + model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]], + tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]], +) -> Dict[ + Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"], + Tuple["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]], +]: + configurations = list(model_mapping.keys()) + model_tokenizer_mapping = OrderedDict([]) + + for configuration in configurations: + if configuration in model_mapping and configuration in tokenizer_mapping: + model = model_mapping[configuration] + tokenizer = tokenizer_mapping[configuration][0] + tokenizer_fast = tokenizer_mapping[configuration][1] + + model_tokenizer_mapping.update({tokenizer: (configuration, model)}) + if tokenizer_fast is not None: + model_tokenizer_mapping.update({tokenizer_fast: (configuration, model)}) + + return model_tokenizer_mapping + + +class TokenizerTesterMixin: + + tokenizer_class = None + rust_tokenizer_class = None + test_slow_tokenizer = True + 
test_rust_tokenizer = True + space_between_special_tokens = False + from_pretrained_kwargs = None + from_pretrained_filter = None + from_pretrained_vocab_key = "vocab_file" + test_seq2seq = True + + # set to True to test a sentencepiece tokenizer + test_sentencepiece = False + + # set to True to ignore casing when testing a sentencepiece tokenizer + # test_sentencepiece must also be set to True + test_sentencepiece_ignore_case = False + + def setUp(self) -> None: + # Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the + # information available in Tokenizer (name, rust class, python class, vocab key name) + if self.test_rust_tokenizer: + tokenizers_list = [ + ( + self.rust_tokenizer_class, + pretrained_name, + self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, + ) + for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[ + self.from_pretrained_vocab_key + ].keys() + if self.from_pretrained_filter is None + or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) + ] + self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed + else: + self.tokenizers_list = [] + with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: + self._data = f_data.read().replace("\n\n", "\n").strip() + + self.tmpdirname = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_txt = self.get_clean_sequence(tokenizer)[0] + return input_txt, input_txt + + def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: + toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))] + toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) + toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) + if max_length is not None and len(toks) > max_length: + toks = toks[:max_length] + if min_length is not None and len(toks) < min_length and len(toks) > 0: + while len(toks) < min_length: + toks = toks + toks + # toks_str = [t[1] for t in toks] + toks_ids = [t[0] for t in toks] + + # Ensure consistency + output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) + if " " not in output_txt and len(toks_ids) > 1: + output_txt = ( + tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) + + " " + + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) + ) + if with_prefix_space: + output_txt = " " + output_txt + output_ids = tokenizer.encode(output_txt, add_special_tokens=False) + return output_txt, output_ids + + def get_tokenizers(self, fast=True, **kwargs) -> List[PreTrainedTokenizerBase]: + if fast and self.test_rust_tokenizer and self.test_slow_tokenizer: + return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] + elif fast and self.test_rust_tokenizer: + return [self.get_rust_tokenizer(**kwargs)] + elif self.test_slow_tokenizer: + return [self.get_tokenizer(**kwargs)] + else: + raise ValueError("This tokenizer class has no tokenizer to be tested.") + + def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: + return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def 
tokenizer_integration_test_util( + self, + expected_encoding: Dict, + model_name: str, + revision: str = None, + sequences: List[str] = None, + decode_kwargs: Dict[str, Any] = None, + padding: bool = True, + ): + """ + Utility for integration tests. + + Text is tokenized and then reverted back to text. Both results are then checked. + + Args: + expected_encoding: + The expected result of the tokenizer output. + model_name: + The model name of the tokenizer to load and use. + revision: + The full git revision number of the model. This pins the + tokenizer config so the tests do not start to fail if the + config gets changed upstream. + sequences: + Can overwrite the texts that are used to check the tokenizer. + This is useful if the tokenizer supports non-English languages + like French. + decode_kwargs: + Additional args for the ``decode`` function which reverts the + tokenized text back to a string. + padding: + Activates and controls padding of the tokenizer. + """ + decode_kwargs = {} if decode_kwargs is None else decode_kwargs + + if sequences is None: + sequences = [ + "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " + "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural " + "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained " + "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.", + "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " + "conditioning on both left and right context in all layers.", + "The quick brown fox jumps over the lazy dog.", + ] + + if self.test_sentencepiece_ignore_case: + sequences = [sequence.lower() for sequence in sequences] + + tokenizer_classes = [self.tokenizer_class] + if self.test_rust_tokenizer: + tokenizer_classes.append(self.rust_tokenizer_class) + + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained( + model_name, + revision=revision, # to pin the tokenizer version + ) + + encoding = tokenizer(sequences, padding=padding) + decoded_sequences = [ + tokenizer.decode(seq, skip_special_tokens=True, **decode_kwargs) for seq in encoding["input_ids"] + ] + + encoding_data = encoding.data + self.assertDictEqual(encoding_data, expected_encoding) + + for expected, decoded in zip(sequences, decoded_sequences): + if self.test_sentencepiece_ignore_case: + expected = expected.lower() + self.assertEqual(expected, decoded) + + def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int): + # Ensure we match max_length + self.assertEqual(len(input_r), max_length) + self.assertEqual(len(input_p), max_length) + + # Ensure the number of padded tokens is the same + padded_tokens_r = list(takewhile(lambda i: i == pad_token_id, reversed(input_r))) + padded_tokens_p = list(takewhile(lambda i: i == pad_token_id, reversed(input_p))) + self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) + + def assert_batch_padded_input_match( + self, + input_r: dict, + input_p: dict, + max_length: int, + pad_token_id: int, + model_main_input_name: str = "input_ids", + ): + for i_r in input_r.values(): + self.assertEqual(len(i_r), 2) + self.assertEqual(len(i_r[0]), max_length) + self.assertEqual(len(i_r[1]), max_length) + + for i_r, i_p in 
zip(input_r[model_main_input_name], input_p[model_main_input_name]): + self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id) + + for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): + self.assertSequenceEqual(i_r, i_p) + + @staticmethod + def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences): + # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...} + # to the list of examples / encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}] + return [ + {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()} + for i in range(len(batch_encode_plus_sequences["input_ids"])) + ] + + # TODO: this test could be extended to all tokenizers - not just the sentencepiece + def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): + """Test ``_tokenize`` and ``convert_tokens_to_string``.""" + if not self.test_sentencepiece: + return + + tokenizer = self.get_tokenizer() + text = "This is text to test the tokenizer." + + if self.test_sentencepiece_ignore_case: + text = text.lower() + + tokens = tokenizer.tokenize(text) + + self.assertTrue(len(tokens) > 0) + + # check if converting back to original text works + reverse_text = tokenizer.convert_tokens_to_string(tokens) + + if self.test_sentencepiece_ignore_case: + reverse_text = reverse_text.lower() + + self.assertEqual(reverse_text, text) + + def test_subword_regularization_tokenizer(self) -> None: + if not self.test_sentencepiece: + return + + # Subword regularization is only available for the slow tokenizer. + sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} + tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) + + self.assertTrue(hasattr(tokenizer, "sp_model_kwargs")) + self.assertIsNotNone(tokenizer.sp_model_kwargs) + self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict)) + self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs) + self.check_subword_sampling(tokenizer) + + def test_pickle_subword_regularization_tokenizer(self) -> None: + """Google pickle __getstate__ __setstate__ if you are struggling with this.""" + if not self.test_sentencepiece: + return + + # Subword regularization is only available for the slow tokenizer.
+ sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} + tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) + tokenizer_bin = pickle.dumps(tokenizer) + del tokenizer + tokenizer_new = pickle.loads(tokenizer_bin) + + self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs")) + self.assertIsNotNone(tokenizer_new.sp_model_kwargs) + self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict)) + self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs) + self.check_subword_sampling(tokenizer_new) + + def test_model_input_names_signature(self): + accepted_model_main_input_names = [ + "input_ids", # nlp models + "input_values", # speech models + ] + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + # first name of model_input_names has to correspond to main model input name + # to make sure `tokenizer.pad(...)` works correctly + self.assertTrue(tokenizer.model_input_names[0] in accepted_model_main_input_names) + + def test_rust_tokenizer_signature(self): + if not self.test_rust_tokenizer: + return + + signature = inspect.signature(self.rust_tokenizer_class.__init__) + + self.assertIn("tokenizer_file", signature.parameters) + self.assertIsNone(signature.parameters["tokenizer_file"].default) + + def test_tokenizer_slow_store_full_signature(self): + if not self.test_slow_tokenizer: + return + + signature = inspect.signature(self.tokenizer_class.__init__) + tokenizer = self.get_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty: + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_tokenizer_fast_store_full_signature(self): + if not self.test_rust_tokenizer: + return + + signature = inspect.signature(self.rust_tokenizer_class.__init__) + tokenizer = self.get_rust_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty and parameter_name != "tokenizer_file": + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence, _ = self.get_input_output_texts(tokenizer) + + # We don't have an exact equivalence on `tokenize()` between Rust and Slow + # Slow tokenizers only split tokens, Rust tokenizers will replace them with <unk> + # tokens = tokenizer.tokenize(sequence) + # rust_tokens = rust_tokenizer.tokenize(sequence) + # self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(sequence, add_special_tokens=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True) + self.assertListEqual(ids, rust_ids) + + def test_tokenizers_common_properties(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + self.assertTrue(hasattr(tokenizer, attr + "_id")) + + self.assertTrue(hasattr(tokenizer, 
"additional_special_tokens")) + self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) + + attributes_list = [ + "model_max_length", + "init_inputs", + "init_kwargs", + ] + if not isinstance(tokenizer, PreTrainedTokenizerFast): + attributes_list += [ + "added_tokens_encoder", + "added_tokens_decoder", + ] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + self.assertIn("bim", after_vocab) + self.assertIn("bambam", after_vocab) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + # Test that we can also use the non-legacy saving format for fast tokenizers + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + 
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + self.assertIn("bim", after_vocab) + self.assertIn("bambam", after_vocab) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + def test_pickle_tokenizer(self): + """Google pickle __getstate__ __setstate__ if you are struggling with this.""" + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertIsNotNone(tokenizer) + + text = "Munich and Berlin are nice cities" + subwords = tokenizer.tokenize(text) + + filename = os.path.join(self.tmpdirname, "tokenizer.bin") + with open(filename, "wb") as handle: + pickle.dump(tokenizer, handle) + + with open(filename, "rb") as handle: + tokenizer_new = pickle.load(handle) + + subwords_loaded = tokenizer_new.tokenize(text) + + self.assertListEqual(subwords, subwords_loaded) + + @require_tokenizers + def test_pickle_added_tokens(self): + tok1 = AddedToken("", rstrip=True, lstrip=True, normalized=False, single_word=True) + tok2 = pickle.loads(pickle.dumps(tok1)) + + self.assertEqual(tok1.__getstate__(), tok2.__getstate__()) + + def test_added_tokens_do_lower_case(self): + # TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens. 
+ tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else [] + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case: + continue + + special_token = tokenizer.all_special_tokens[0] + + text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token + text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token + + toks0 = tokenizer.tokenize(text) # toks before adding new_toks + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] + added = tokenizer.add_tokens(new_toks) + self.assertEqual(added, 2) + + toks = tokenizer.tokenize(text) + toks2 = tokenizer.tokenize(text2) + + self.assertEqual(len(toks), len(toks2)) + self.assertListEqual(toks, toks2) + if not isinstance(tokenizer, PreTrainedTokenizerFast): + # Python tokenizers can have added tokens with spaces inside them + # cf https://github.com/huggingface/tokenizers/issues/302 + self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer + + # Check that none of the special tokens are lowercased + sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B" + tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens) + + for special_token in tokenizer.all_special_tokens: + self.assertTrue(special_token in tokenized_sequence) + + tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else [] + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case: + continue + + special_token = tokenizer.all_special_tokens[0] + + text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token + text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] + + toks0 = tokenizer.tokenize(text) # toks before adding new_toks + + added = tokenizer.add_tokens(new_toks) + self.assertIn(added, [2, 4]) + + toks = tokenizer.tokenize(text) + toks2 = tokenizer.tokenize(text2) + + self.assertEqual(len(toks), len(toks2)) # Length should still be the same + self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ + if not isinstance(tokenizer, PreTrainedTokenizerFast): + # Python tokenizers can have added tokens with spaces inside them + # cf https://github.com/huggingface/tokenizers/issues/302 + self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer + + def test_add_tokens_tokenizer(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode("aaaaa bbbbbb low 
cccccccccdddddddd l", add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + def test_add_special_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input_text, ids = self.get_clean_sequence(tokenizer) + + special_token = "[SPECIAL_TOKEN]" + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) + self.assertEqual(len(encoded_special_token), 1) + + text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False) + encoded = tokenizer.encode(text, add_special_tokens=False) + + input_encoded = tokenizer.encode(input_text, add_special_tokens=False) + special_token_id = tokenizer.encode(special_token, add_special_tokens=False) + self.assertEqual(encoded, input_encoded + special_token_id) + + decoded = tokenizer.decode(encoded, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input_text, output_text = self.get_input_output_texts(tokenizer) + + tokens = tokenizer.tokenize(input_text) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(input_text, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + self.assertEqual(text_2, output_text) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + # new_toks = ["[ABC]", "[DEF]"] # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"] + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(input, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + 
def test_pretrained_model_lists(self): + # We should have at least one default checkpoint for each tokenizer + # We should specify the max input length as well (used in some places to list the pretrained checkpoints) + self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) + self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) + self.assertEqual( + len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), + len(self.tokenizer_class.max_model_input_sizes), + ) + + weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) + weights_lists_2 = [] + for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): + weights_lists_2.append(list(map_list.keys())) + + for weights_list_2 in weights_lists_2: + self.assertListEqual(weights_list, weights_list_2) + + def test_mask_output(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): + seq_0 = "Test this method." + seq_1 = "With these inputs." + information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + + # We want sequence 0 and sequence 1 to be tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model uses token type ids) + # We use this assumption in the QA pipeline among other places + output = tokenizer(seq_0, return_token_type_ids=True) + self.assertIn(0, output["token_type_ids"]) + + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = "With these inputs." + + # We want sequence 0 and sequence 1 to be tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model uses token type ids) + # We use this assumption in the QA pipeline among other places + output = tokenizer(seq_0) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + seq_0 = "Test this method." + seq_1 = "With these inputs." + + sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) + attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) + + # Method is implemented (e.g. 
not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) + ) + + def test_maximum_encoding_length_single_input(self): + tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) + + sequence = tokenizer.encode(seq_0, add_special_tokens=False) + total_length = len(sequence) + + assert total_length > 4, "Issue with the testing sequence, please update it, it's too short" + + # Test with max model input length + model_max_length = tokenizer.model_max_length + self.assertEqual(model_max_length, 100) + seq_1 = seq_0 * model_max_length + + sequence1 = tokenizer(seq_1, add_special_tokens=False) + total_length1 = len(sequence1["input_ids"]) + assert ( + total_length1 > model_max_length + ), "Issue with the testing sequence, please update it, it's too short" + + # Simple + padding_strategies = ( + [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] + ) + for padding_state in padding_strategies: + with self.subTest(f"Padding: {padding_state}"): + for truncation_state in [True, "longest_first", "only_first"]: + with self.subTest(f"Truncation: {truncation_state}"): + output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state) + self.assertEqual(len(output["input_ids"]), model_max_length) + + output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state) + self.assertEqual(len(output["input_ids"][0]), model_max_length) + + # Simple with no truncation + # Reset warnings + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer(seq_1, padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length for this model" + ) + ) + + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer([seq_1], padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"][0]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length for this model" + ) + ) + + # Overflowing tokens + stride = 2 + information = tokenizer( + seq_0, + max_length=total_length - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, PreTrainedTokenizerFast): + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), total_length - 2) + self.assertEqual(truncated_sequence, sequence[:-2]) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) + else: + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + + self.assertEqual(len(truncated_sequence), 
total_length - 2) + self.assertEqual(truncated_sequence, sequence[:-2]) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + + def test_maximum_encoding_length_pair_input(self): + tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Build a sequence from our model's vocabulary + stride = 2 + seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) + if len(ids) <= 2 + stride: + seq_0 = (seq_0 + " ") * (2 + stride) + ids = None + + seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False) + assert len(seq0_tokens) > 2 + stride + + seq_1 = "This is another sentence to be encoded." + seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) + if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2: + seq1_tokens = seq1_tokens + seq1_tokens + seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False) + seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) + + assert len(seq1_tokens) > 2 + stride + + smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens + + # We are not using the special tokens - a bit too hard to test all the tokenizers with this + # TODO try this again later + sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) # , add_prefix_space=False) + + # Test with max model input length + model_max_length = tokenizer.model_max_length + self.assertEqual(model_max_length, 100) + seq_2 = seq_0 * model_max_length + assert len(seq_2) > model_max_length + + sequence1 = tokenizer(seq_1, add_special_tokens=False) + total_length1 = len(sequence1["input_ids"]) + sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False) + total_length2 = len(sequence2["input_ids"]) + assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it." + assert total_length2 > model_max_length, "Issue with the testing sequence, please update it." 
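+ + # From here on, seq_2 on its own exceeds model_max_length while seq_1 stays + # well under it, so every truncation strategy below must bring the pair back + # to exactly model_max_length tokens.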
+ + # Simple + padding_strategies = ( + [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] + ) + for padding_state in padding_strategies: + with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"): + for truncation_state in [True, "longest_first", "only_first"]: + with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"): + output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state) + self.assertEqual(len(output["input_ids"]), model_max_length) + + output = tokenizer( + [seq_2], [seq_1], padding=padding_state, truncation=truncation_state + ) + self.assertEqual(len(output["input_ids"][0]), model_max_length) + + # Simple + output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second") + self.assertEqual(len(output["input_ids"]), model_max_length) + + output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second") + self.assertEqual(len(output["input_ids"][0]), model_max_length) + + # Simple with no truncation + # Reset warnings + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length for this model" + ) + ) + + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"][0]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length for this model" + ) + ) + + truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode( + seq_1, add_special_tokens=False + ) + truncated_second_sequence = ( + tokenizer.encode(seq_0, add_special_tokens=False) + + tokenizer.encode(seq_1, add_special_tokens=False)[:-2] + ) + truncated_longest_sequence = ( + truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence + ) + + overflow_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[ + -(2 + stride) : + ] + tokenizer.encode(seq_1, add_special_tokens=False) + overflow_second_sequence = ( + tokenizer.encode(seq_0, add_special_tokens=False) + + tokenizer.encode(seq_1, add_special_tokens=False)[-(2 + stride) :] + ) + overflow_longest_sequence = ( + overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence + ) + + information = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, PreTrainedTokenizerFast): + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + 
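+ # Note: the two branches reflect a real API difference. With
+ # `return_overflowing_tokens=True`, a fast tokenizer returns the overflow as
+ # additional rows of "input_ids" (one encoding per chunk), while a slow
+ # tokenizer returns a flat "overflowing_tokens" list. A minimal sketch,
+ # assuming a hypothetical fast tokenizer `tok`:
+ #
+ #     info = tok("a long input ...", max_length=8, stride=2,
+ #                truncation="longest_first", return_overflowing_tokens=True,
+ #                add_special_tokens=False)
+ #     chunks = info["input_ids"]  # [main chunk, overflow chunk(s)], with
+ #                                 # consecutive chunks sharing `stride` ids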
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) + self.assertEqual(overflowing_tokens, overflow_longest_sequence) + else: + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + self.assertEqual( + len(overflowing_tokens), 2 + stride + ) # No overflowing tokens when using 'longest' in python tokenizers + + information = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, PreTrainedTokenizerFast): + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) + self.assertEqual(overflowing_tokens, overflow_longest_sequence) + else: + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + self.assertEqual( + len(overflowing_tokens), 2 + stride + ) # No overflowing tokens when using 'longest' in python tokenizers + + information_first_truncated = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation="only_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, PreTrainedTokenizerFast): + truncated_sequence = information_first_truncated["input_ids"][0] + overflowing_tokens = information_first_truncated["input_ids"][1] + self.assertEqual(len(information_first_truncated["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_first_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens)) + self.assertEqual(overflowing_tokens, overflow_first_sequence) + else: + truncated_sequence = information_first_truncated["input_ids"] + overflowing_tokens = information_first_truncated["overflowing_tokens"] + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_first_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :]) + + information_second_truncated = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=False, + stride=stride, + truncation="only_second", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, PreTrainedTokenizerFast): + truncated_sequence = information_second_truncated["input_ids"][0] + overflowing_tokens = information_second_truncated["input_ids"][1] + self.assertEqual(len(information_second_truncated["input_ids"]), 2) + + 
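+ # Recap of the strategies compared in these branches: "longest_first" drops
+ # tokens from whichever sequence is currently longer, "only_first" truncates
+ # only the first sequence, and "only_second" truncates only the second. A
+ # minimal sketch, assuming a hypothetical tokenizer `tok`:
+ #
+ #     out = tok("short question", "a much longer context passage ...",
+ #               truncation="only_second", max_length=32)
+ #     # the question survives intact; only the context loses tokens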
self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens)) + self.assertEqual(overflowing_tokens, overflow_second_sequence) + else: + truncated_sequence = information_second_truncated["input_ids"] + overflowing_tokens = information_second_truncated["overflowing_tokens"] + + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) + + # def test_encode_input_type(self): + # tokenizers = self.get_tokenizers(do_lower_case=False) + # for tokenizer in tokenizers: + # with self.subTest(f"{tokenizer.__class__.__name__}"): + # sequence = "Let's encode this sequence" + + # tokens = sequence.split() # tokenizer.tokenize(sequence) + # # input_ids = tokenizer.convert_tokens_to_ids(tokens) + # formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False) + + # self.assertEqual( + # tokenizer.encode(tokens, is_split_into_words=True, add_special_tokens=True), formatted_input + # ) + # # This is not supported with the Rust tokenizers + # # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) + + # def test_swap_special_token(self): + # tokenizers = self.get_tokenizers(do_lower_case=False) + # for tokenizer in tokenizers: + # with self.subTest(f"{tokenizer.__class__.__name__}"): + # # Our mask token + # mask = "" + # # We take a single word in the middle of the vocabulary + # all_tokens = sorted(tokenizer.get_vocab().keys()) + # word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1]) + + # sequence_0 = "Encode " + word + " sequence" + # sequence_masked_0 = "Encode " + mask + " sequence" + + # sequence_1 = word + " this sequence" + # sequence_masked_1 = mask + " this sequence" + + # # Add tokens so that masked token isn't split + # # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()] + # # tokenizer.add_tokens(tokens) + # tokenizer.add_special_tokens( + # {"mask_token": AddedToken(mask, normalized=False)} + # ) # Eat left space on Byte-level BPE tokenizers + # mask_ind = tokenizer.convert_tokens_to_ids(mask) + + # # Test first masked sequence + # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False) + # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False) + # assert len(encoded_masked) == len(encoded_0) + # mask_loc = encoded_masked.index(mask_ind) + # encoded_masked[mask_loc] = encoded_0[mask_loc] + + # self.assertEqual(encoded_masked, encoded_0) + + # # Test second masked sequence + # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False) + # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False) + # assert len(encoded_masked) == len(encoded_1) + # mask_loc = encoded_masked.index(mask_ind) + # encoded_masked[mask_loc] = encoded_1[mask_loc] + + # self.assertEqual(encoded_masked, encoded_1) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." 
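+ # The mask verified here marks, position by position, which ids were added
+ # by the tokenizer (1) versus coming from the text (0). A minimal sketch,
+ # assuming a hypothetical BERT-like tokenizer `tok` with [CLS]/[SEP]:
+ #
+ #     enc = tok.encode_plus("Encode this.", add_special_tokens=True,
+ #                           return_special_tokens_mask=True)
+ #     # enc["special_tokens_mask"] -> [1, 0, 0, 0, 1]
+ #     #                          [CLS] encode this . [SEP]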
+ # Testing single inputs + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True # , add_prefix_space=False + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + sequence_1 = "This one too please." + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, + sequence_1, + add_special_tokens=True, + return_special_tokens_mask=True, + # add_prefix_space=False, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence, padding=True) + padded_sequence_right_length = len(padded_sequence_right) + assert 
sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(sequence, padding="longest") + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(sequence, padding=False) + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add one if needed + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + # FIXME: the next line should be padding(max_length) to avoid warning + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, pad_to_max_length=True + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.pad_token is None: + self.skipTest("No padding token.") + else: + empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) + for key, value in empty_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer("This", pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} 
is not multiple of 8") + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + tokenizer.__call__, + "This", + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + + def test_encode_plus_with_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence = "Sequence" + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_size = 10 + padding_idx = tokenizer.pad_token_id + token_type_padding_idx = tokenizer.pad_token_type_id + + encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) + + # Test 'longest' and 'no_padding' don't do anything + tokenizer.padding_side = "right" + + not_padded_sequence = tokenizer.encode_plus( + sequence, + padding=True, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + assert sequence_length == not_padded_sequence_length + assert input_ids == not_padded_input_ids + assert special_tokens_mask == not_padded_special_tokens_mask + + not_padded_sequence = tokenizer.encode_plus( + sequence, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + assert sequence_length == not_padded_sequence_length + assert input_ids == not_padded_input_ids + assert special_tokens_mask == not_padded_special_tokens_mask + + # Test right padding + tokenizer.padding_side = "right" + + right_padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + right_padded_input_ids = right_padded_sequence["input_ids"] + + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + assert sequence_length + padding_size == right_padded_sequence_length + assert input_ids + [padding_idx] * padding_size == right_padded_input_ids + assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask + + # Test left padding + tokenizer.padding_side = "left" + left_padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) + + assert sequence_length + padding_size == left_padded_sequence_length + assert [padding_idx] * padding_size + input_ids == left_padded_input_ids + assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = 
right_padded_sequence["token_type_ids"] + + assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids + assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + assert attention_mask + [0] * padding_size == right_padded_attention_mask + assert [0] * padding_size + attention_mask == left_padded_attention_mask + + def test_separate_tokenizers(self): + # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when + # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today. + + tokenizers = self.get_tokenizers(random_argument=True) + new_tokenizers = self.get_tokenizers(random_argument=False) + + for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers): + with self.subTest(f"{tokenizer.__class__.__name__}"): + assert tokenizer.init_kwargs["random_argument"] is True + assert new_tokenizer.init_kwargs["random_argument"] is False + + def test_get_vocab(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_dict = tokenizer.get_vocab() + self.assertIsInstance(vocab_dict, dict) + self.assertGreaterEqual(len(tokenizer), len(vocab_dict)) + + vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] + self.assertEqual(len(vocab), len(tokenizer)) + + tokenizer.add_tokens(["asdfasdfasdfasdf"]) + vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] + self.assertEqual(len(vocab), len(tokenizer)) + + def test_conversion_reversible(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab = tokenizer.get_vocab() + for word, ind in vocab.items(): + if word == tokenizer.unk_token: + continue + self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind) + self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word) + + def test_call(self): + # Tests that all calls wrap to encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + # Test not batched + encoded_sequences_1 = tokenizer.encode_plus(sequences[0]) + encoded_sequences_2 = tokenizer(sequences[0]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + encoded_sequences_1 = tokenizer.encode_plus(sequences[0], sequences[1]) + encoded_sequences_2 = tokenizer(sequences[0], sequences[1]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + encoded_sequences_1 = tokenizer.batch_encode_plus(sequences) + encoded_sequences_2 = tokenizer(sequences) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched pairs + encoded_sequences_1 = tokenizer.batch_encode_plus(list(zip(sequences, sequences))) + encoded_sequences_2 = tokenizer(sequences, sequences) + 
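+ # In other words, `tokenizer(...)` is a thin dispatcher: single strings and
+ # string pairs are routed to `encode_plus`, lists (or lists of pairs) to
+ # `batch_encode_plus`, so both spellings must yield identical BatchEncodings.
+ # A minimal sketch, assuming a hypothetical tokenizer `tok`:
+ #
+ #     assert tok("a b") == tok.encode_plus("a b")
+ #     assert tok(["a", "b"]) == tok.batch_encode_plus(["a", "b"])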
self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_batch_encode_plus_batch_sequence_length(self): + # Tests that all encoded values have the correct size + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + encoded_sequences = [tokenizer.encode_plus(sequence) for sequence in sequences] + encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, padding=False) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + maximum_length = len( + max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) + ) + + # check correct behaviour if no pad_token_id exists and add one if needed + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences_padded = [ + tokenizer.encode_plus(sequence, max_length=maximum_length, padding="max_length") + for sequence in sequences + ] + + encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, padding=True) + self.assertListEqual( + encoded_sequences_padded, + self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), + ) + + # check 'longest' is insensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=True) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + sequences, max_length=maximum_length + 10, padding="longest" + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + # check 'no_padding' is insensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=False) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + sequences, max_length=maximum_length + 10, padding=False + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + @require_tokenizers + def test_added_token_serializable(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + new_token = AddedToken("new_token", lstrip=True) + tokenizer.add_special_tokens({"additional_special_tokens": [new_token]}) + + with tempfile.TemporaryDirectory() as tmp_dir_name: + tokenizer.save_pretrained(tmp_dir_name) + tokenizer.from_pretrained(tmp_dir_name) + + def test_batch_encode_plus_padding(self): + # Test that padded sequences are equivalent between batch_encode_plus and encode_plus + + # Right padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add one if needed + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences = [ + tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length") + 
for sequence in sequences + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + sequences, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + # Left padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokenizer.padding_side = "left" + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences = [ + tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length") + for sequence in sequences + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + sequences, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + def test_pretokenized_inputs(self): + # Test when inputs are pretokenized + + tokenizers = self.get_tokenizers(do_lower_case=False) # , add_prefix_space=True) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space: + continue + + # Prepare a sequence from our tokenizer vocabulary + sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20) + # sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good + token_sequence = sequence.split() + # sequence_no_prefix_space = sequence.strip() + + # Test encode for pretokenized inputs + output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=False) + output_sequence = tokenizer.encode(sequence, add_special_tokens=False) + self.assertEqual(output, output_sequence) + + output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=True) + output_sequence = tokenizer.encode(sequence, add_special_tokens=True) + self.assertEqual(output, output_sequence) + + # Test encode_plus for pretokenized inputs + output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False) + output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True) + output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + + # Test batch_encode_plus for pretokenized inputs + sequence_batch = [sequence.strip()] * 2 + [sequence.strip() + " " + sequence.strip()] + token_sequence_batch = [s.split() for s in sequence_batch] + sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch] + + output = tokenizer.batch_encode_plus( + token_sequence_batch, is_split_into_words=True, add_special_tokens=False + ) + output_sequence = tokenizer.batch_encode_plus( + sequence_batch_cleaned_up_spaces, add_special_tokens=False + ) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + output = tokenizer.batch_encode_plus( + 
token_sequence_batch, is_split_into_words=True, add_special_tokens=True + ) + output_sequence = tokenizer.batch_encode_plus( + sequence_batch_cleaned_up_spaces, add_special_tokens=True + ) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + + # Test encode for pretokenized inputs pairs + output = tokenizer.encode( + token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False + ) + output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=False) + self.assertEqual(output, output_sequence) + output = tokenizer.encode( + token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True + ) + output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=True) + self.assertEqual(output, output_sequence) + + # Test encode_plus for pretokenized inputs pairs + output = tokenizer.encode_plus( + token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False + ) + output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + output = tokenizer.encode_plus( + token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True + ) + output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + + # Test batch_encode_plus for pretokenized inputs pairs + sequence_pair_batch = [(sequence.strip(), sequence.strip())] * 2 + [ + (sequence.strip() + " " + sequence.strip(), sequence.strip()) + ] + token_sequence_pair_batch = [tuple(s.split() for s in pair) for pair in sequence_pair_batch] + sequence_pair_batch_cleaned_up_spaces = [ + tuple(" " + " ".join(s) for s in pair) for pair in token_sequence_pair_batch + ] + + output = tokenizer.batch_encode_plus( + token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False + ) + output_sequence = tokenizer.batch_encode_plus( + sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False + ) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + output = tokenizer.batch_encode_plus( + token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True + ) + output_sequence = tokenizer.batch_encode_plus( + sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True + ) + for key in output.keys(): + self.assertEqual(output[key], output_sequence[key]) + + def test_prepare_for_model(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + string_sequence = "Testing the prepare_for_model method." 
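+ # `prepare_for_model` is the back half of `encode_plus`: given ids already
+ # produced by `encode(..., add_special_tokens=False)`, it adds special
+ # tokens, applies truncation/padding and builds the final dict, so the two
+ # paths checked below must agree. A minimal sketch, assuming a hypothetical
+ # tokenizer `tok`:
+ #
+ #     ids = tok.encode("some text", add_special_tokens=False)
+ #     assert (tok.prepare_for_model(ids, add_special_tokens=True)
+ #             == tok.encode_plus("some text", add_special_tokens=True))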
+ ids = tokenizer.encode(string_sequence, add_special_tokens=False) + prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True) + + input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True) + + self.assertEqual(input_dict, prepared_input_dict) + + def test_batch_encode_plus_overflowing_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + string_sequences = ["Testing the prepare_for_model method.", "Test"] + + if tokenizer.pad_token is None: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + tokenizer.batch_encode_plus( + string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3 + ) + + @is_pt_tf_cross_test + def test_batch_encode_plus_tensors(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + # A Tensor cannot be built from sequences which are not all the same size + self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="pt") + self.assertRaises(ValueError, tokenizer.batch_encode_plus, sequences, return_tensors="tf") + + if tokenizer.pad_token_id is None: + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + sequences, + padding=True, + return_tensors="pt", + ) + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + sequences, + padding="longest", + return_tensors="tf", + ) + else: + pytorch_tensor = tokenizer.batch_encode_plus(sequences, padding=True, return_tensors="pt") + tensorflow_tensor = tokenizer.batch_encode_plus(sequences, padding="longest", return_tensors="tf") + encoded_sequences = tokenizer.batch_encode_plus(sequences, padding=True) + + for key in encoded_sequences.keys(): + pytorch_value = pytorch_tensor[key].tolist() + tensorflow_value = tensorflow_tensor[key].numpy().tolist() + encoded_value = encoded_sequences[key] + + self.assertEqual(pytorch_value, tensorflow_value, encoded_value) + + def _check_no_pad_token_padding(self, tokenizer, sequences): + # if tokenizer does not have pad_token_id, an error should be thrown + if tokenizer.pad_token_id is None: + with self.assertRaises(ValueError): + if isinstance(sequences, list): + tokenizer.batch_encode_plus(sequences, padding="longest") + else: + tokenizer.encode_plus(sequences, padding=True) + + # add pad_token_id to pass subsequent tests + tokenizer.add_special_tokens({"pad_token": "<PAD>"}) + + def check_subword_sampling( + self, + tokenizer: PreTrainedTokenizer, + text: str = None, + ) -> None: + """ + Check if the tokenizer generates different results when subword regularization is enabled. + + Subword regularization augments training data with subword sampling. + This has a random component. + + Args: + tokenizer: The tokenizer to check. + text: The text to use for the checks. + """ + text = "This is a test for subword regularization." 
if text is None else text + if self.test_sentencepiece_ignore_case: + text = text.lower() + + tokens_list = [] + for _ in range(5): + tokens_list.append(tokenizer.tokenize(text)) + + # all pairs of tokenizations from tokens_list + combinations = itertools.combinations(tokens_list, 2) + + # check if sampling is done + subword_sampling_found = False + for combination in combinations: + if combination[0] != combination[1]: + subword_sampling_found = True + self.assertTrue(subword_sampling_found) + + # check if converting back to original text works + for tokens in tokens_list: + if self.test_sentencepiece_ignore_case: + self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower()) + else: + self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens)) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="pt") + + # Ensure that the BatchEncoding.to() method works. 
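+ # (`BatchEncoding.to` moves every tensor member at once and returns the
+ # BatchEncoding. A minimal sketch, assuming a hypothetical tokenizer `tok`
+ # and an available CUDA device:
+ #
+ #     enc = tok("hello", return_tensors="pt").to("cuda")
+ #     model(**enc)  # input_ids, attention_mask, ... are all on the GPU
+ # )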
+ encoded_sequence.to(model.device) + + batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt") + # This should not fail + + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + + # if self.test_rust_tokenizer: + # fast_tokenizer = self.get_rust_tokenizer() + # encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="pt") + # batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt") + # # This should not fail + # model(**encoded_sequence_fast) + # model(**batch_encoded_sequence_fast) + + @require_tf + @slow + def test_tf_encode_plus_sent_to_model(self): + from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + assert model.config.vocab_size >= len(tokenizer) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="tf") + batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="tf") + + # This should not fail + model(encoded_sequence) + model(batch_encoded_sequence) + + # TODO: Check if require_torch is the best to test for numpy here ... Maybe move to require_flax when available + @require_torch + @slow + def test_np_encode_plus_sent_to_model(self): + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="np") + batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np") + + # TODO: add forward through JAX/Flax when PR is merged + # This is currently here to make flake8 happy ! + if encoded_sequence is None: + raise ValueError("Cannot convert list to numpy tensor on encode_plus()") + + if batch_encoded_sequence is None: + raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus()") + + if self.test_rust_tokenizer: + fast_tokenizer = self.get_rust_tokenizer() + encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="np") + batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus( + [sequence, sequence], return_tensors="np" + ) + + # TODO: add forward through JAX/Flax when PR is merged + # This is currently here to make flake8 happy ! 
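+ # (Side note on `return_tensors`: "pt", "tf" and "np" only change the
+ # container type (torch.Tensor, tf.Tensor, np.ndarray); the ids themselves
+ # are identical. A minimal sketch, assuming a hypothetical tokenizer `tok`:
+ #
+ #     ids_np = tok("hello", return_tensors="np")["input_ids"]
+ #     ids_pt = tok("hello", return_tensors="pt")["input_ids"]
+ #     assert ids_np.tolist() == ids_pt.tolist()
+ # )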
+ if encoded_sequence_fast is None: + raise ValueError("Cannot convert list to numpy tensor on encode_plus() (fast)") + + if batch_encoded_sequence_fast is None: + raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus() (fast)") + + @require_torch + def test_prepare_seq2seq_batch(self): + if not self.test_seq2seq: + return + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Longer text that will definitely require truncation. + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei " + 'pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu ' + "vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", + ] + try: + batch = tokenizer.prepare_seq2seq_batch( + src_texts=src_text, + tgt_texts=tgt_text, + max_length=3, + max_target_length=10, + return_tensors="pt", + src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error + ) + except NotImplementedError: + return + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.labels.shape[1], 10) + # max_target_length will default to max_length if not specified + batch = tokenizer.prepare_seq2seq_batch( + src_text, tgt_texts=tgt_text, max_length=3, return_tensors="pt" + ) + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.labels.shape[1], 3) + + batch_encoder_only = tokenizer.prepare_seq2seq_batch( + src_texts=src_text, max_length=3, max_target_length=10, return_tensors="pt" + ) + self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) + self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) + self.assertNotIn("decoder_input_ids", batch_encoder_only) + + def test_is_fast(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + # Check is_fast is set correctly + self.assertTrue(tokenizer_r.is_fast) + + if self.test_slow_tokenizer: + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + self.assertFalse(tokenizer_p.is_fast) + + def test_fast_only_inputs(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Ensure None raise an error + self.assertRaises(TypeError, tokenizer_r.tokenize, None) + self.assertRaises(TypeError, tokenizer_r.encode, None) + self.assertRaises(TypeError, tokenizer_r.encode_plus, None) + self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None) + + def test_alignement_methods(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + words = ["Wonderful", 
"no", "inspiration", "example", "with", "subtoken"] + text = " ".join(words) + batch_size = 3 + + encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) + + batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) + num_tokens = len(encoding["input_ids"]) + + last_word_index = len(words) - 1 + last_token_index = num_tokens - 1 + last_batch_index = batch_size - 1 + last_char_index = len(text) - 1 + + # words, tokens + self.assertEqual(len(encoding.words(0)), num_tokens) + self.assertEqual(max(encoding.words(0)), last_word_index) + self.assertEqual(min(encoding.words(0)), 0) + self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) + self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) + self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) + self.assertEqual(len(encoding.tokens(0)), num_tokens) + + # Assert token_to_word + self.assertEqual(encoding.token_to_word(0), 0) + self.assertEqual(encoding.token_to_word(0, 0), 0) + self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) + self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) + self.assertEqual(batch_encoding.token_to_word(1, 0), 0) + self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) + self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) + + # Assert word_to_tokens + self.assertEqual(encoding.word_to_tokens(0).start, 0) + self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) + self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) + self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) + self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) + self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) + self.assertEqual( + batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1 + ) + + # Assert token_to_chars + self.assertEqual(encoding.token_to_chars(0).start, 0) + self.assertEqual(encoding.token_to_chars(0, 0).start, 0) + self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) + self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) + self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) + self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) + self.assertEqual( + batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1 + ) + + # Assert char_to_token + self.assertEqual(encoding.char_to_token(0), 0) + self.assertEqual(encoding.char_to_token(0, 0), 0) + self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) + self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) + self.assertEqual(batch_encoding.char_to_token(1, 0), 0) + self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) + self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) + + # Assert char_to_word + self.assertEqual(encoding.char_to_word(0), 0) + self.assertEqual(encoding.char_to_word(0, 0), 0) + self.assertEqual(encoding.char_to_word(last_char_index), last_word_index) + self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) + self.assertEqual(batch_encoding.char_to_word(1, 0), 0) + 
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index) + self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) + + # Assert word_to_chars + self.assertEqual(encoding.word_to_chars(0).start, 0) + self.assertEqual(encoding.word_to_chars(0, 0).start, 0) + self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1) + self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) + self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0) + self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) + self.assertEqual( + batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 + ) + + # Assert token_to_sequence + self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0) + self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0) + self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0) + + # Pair of input sequences + + words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] + text = " ".join(words) + pair_words = ["Amazing", "example", "full", "of", "inspiration"] + pair_text = " ".join(pair_words) + batch_size = 3 + index_word_in_first_seq = words.index("inspiration") + index_word_in_pair_seq = pair_words.index("inspiration") + index_char_in_first_seq = text.find("inspiration") + index_char_in_pair_seq = pair_text.find("inspiration") + + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False) + + pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=False + ) + num_tokens = len(encoding["input_ids"]) + + last_word_index = len(words) - 1 + last_token_index = num_tokens - 1 + last_batch_index = batch_size - 1 + last_char_index = len(text) - 1 + + # Assert word_to_tokens + self.assertNotEqual( + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start + ], + pair_encoding["input_ids"][ + pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start + ], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start + ], + ) + + # Assert char_to_token + self.assertNotEqual( + pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)], + pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0), + 
pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0) + ], + pair_batch_encoding["input_ids"][1][ + pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1) + ], + ) + + # Assert char_to_word + self.assertNotEqual( + pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0), + pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)], + pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)], + ) + self.assertNotEqual( + pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0), + pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1), + ) + self.assertEqual( + words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)], + pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)], + ) + + # Assert word_to_chars + self.assertNotEqual( + pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start, + pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start], + pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start], + ) + self.assertNotEqual( + pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start, + pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start, + ) + self.assertEqual( + text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start], + pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], + ) + + # Assert token_to_sequence + pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True) + + pair_sequence_ids = [ + pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"])) + ] + self.assertIn(0, pair_sequence_ids) + self.assertIn(1, pair_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_sequence_ids) + + pair_batch_encoding = tokenizer_r.batch_encode_plus( + [(text, pair_text)] * batch_size, add_special_tokens=True + ) + pair_batch_sequence_ids = [ + pair_batch_encoding.token_to_sequence(1, i) + for i in range(len(pair_batch_encoding["input_ids"][0])) + ] + self.assertIn(0, pair_batch_sequence_ids) + self.assertIn(1, pair_batch_sequence_ids) + if tokenizer_r.num_special_tokens_to_add(pair=True): + self.assertIn(None, pair_batch_sequence_ids) + + def test_tokenization_python_rust_equals(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Ensure basic input match + input_p = tokenizer_p.encode_plus(self._data) + input_r = tokenizer_r.encode_plus(self._data) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], 
input_p.keys()): + self.assertSequenceEqual(input_p[key], input_r[key]) + + input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) + input_pairs_r = tokenizer_r.encode_plus(self._data, self._data) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) + + # Ensure truncation match + input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) + input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_p[key], input_r[key]) + + # Ensure truncation with stride match + input_p = tokenizer_p.encode_plus( + self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + input_r = tokenizer_r.encode_plus( + self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_p[key], input_r[key][0]) + + def test_num_special_tokens_to_add_equal(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual( + tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False) + ) + self.assertEqual( + tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True) + ) + + def test_max_length_equal(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Check we have the correct max_length for both pair and non-pair inputs. + self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) + self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + + def test_special_tokens_map_equal(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Assert the set of special tokens match. 
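+                # special_tokens_map is a plain dict mapping attribute names (e.g. "unk_token") to their
+                # string values, so comparing the two .items() views covers both the names and the values.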
+                self.assertSequenceEqual(
+                    tokenizer_p.special_tokens_map.items(),
+                    tokenizer_r.special_tokens_map.items(),
+                )
+
+    def test_add_tokens(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                vocab_size = len(tokenizer_r)
+                self.assertEqual(tokenizer_r.add_tokens(""), 0)
+                self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
+                self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
+                self.assertEqual(len(tokenizer_r), vocab_size + 3)
+
+                self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
+                self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
+                # `additional_special_tokens` must be a list, so passing a bare string should raise
+                self.assertRaises(
+                    AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
+                )
+                self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
+                self.assertEqual(
+                    tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
+                )
+                self.assertEqual(len(tokenizer_r), vocab_size + 8)
+
+    def test_offsets_mapping(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                text = "Wonderful no inspiration example with subtoken"
+                pair = "Along with an awesome pair"
+
+                # No pair
+                tokens_with_offsets = tokenizer_r.encode_plus(
+                    text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
+                )
+                added_tokens = tokenizer_r.num_special_tokens_to_add(False)
+                offsets = tokens_with_offsets["offset_mapping"]
+
+                # Assert there is the same number of tokens and offsets
+                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
+
+                # Assert there are exactly `added_tokens` special tokens in the mask
+                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
+
+                # Pairs
+                tokens_with_offsets = tokenizer_r.encode_plus(
+                    text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
+                )
+                added_tokens = tokenizer_r.num_special_tokens_to_add(True)
+                offsets = tokens_with_offsets["offset_mapping"]
+
+                # Assert there is the same number of tokens and offsets
+                self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
+
+                # Assert there are exactly `added_tokens` special tokens in the mask
+                self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
+
+    def test_batch_encode_dynamic_overflowing(self):
+        """
+        When calling batch_encode with multiple sequences, it can return a different number of
+        overflowing encodings for each sequence:
+        [
+          Sequence 1: [Encoding 1, Encoding 2],
+          Sequence 2: [Encoding 1],
+          Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
+        ]
+        This needs to be padded so that it can be represented as a tensor
+        """
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+
+                if is_torch_available():
+                    returned_tensor = "pt"
+                elif is_tf_available():
+                    returned_tensor = "tf"
+                else:
+                    returned_tensor = "jax"
+
+                if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
+                    return
+
+                tokens = tokenizer.encode_plus(
+                    "HuggingFace is solving NLP one commit at a time",
+                    max_length=6,
+                    padding=True,
+                    truncation=True,
+                    return_tensors=returned_tensor,
+                    return_overflowing_tokens=True,
+                )
+
+                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
+                    self.assertEqual(len(tokens[key].shape), 2)
+
+                # Single sample
+                tokens = tokenizer.batch_encode_plus(
+                    ["HuggingFace is solving NLP one commit at a time"],
+                    max_length=6,
+                    padding=True,
+                    truncation="only_first",
+                    return_tensors=returned_tensor,
+                    return_overflowing_tokens=True,
+                )
+
+                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
+                    self.assertEqual(len(tokens[key].shape), 2)
+                    self.assertEqual(tokens[key].shape[-1], 6)
+
+                # Multiple samples
+                tokens = tokenizer.batch_encode_plus(
+                    ["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
+                    max_length=6,
+                    padding=True,
+                    truncation="only_first",
+                    return_tensors=returned_tensor,
+                    return_overflowing_tokens=True,
+                )
+
+                for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
+                    self.assertEqual(len(tokens[key].shape), 2)
+                    self.assertEqual(tokens[key].shape[-1], 6)
+
+    def test_compare_pretokenized_inputs(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
+                    continue  # Too hard to test for now
+
+                # Input string
+                pretokenized_input_simple = "This is a sample input".split()
+                pretokenized_input_pair = "This is a sample pair".split()
+
+                # Test encode for pretokenized inputs
+                output_r = tokenizer_r.encode(
+                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
+                )
+                output_p = tokenizer_p.encode(
+                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
+                )
+                self.assertEqual(output_p, output_r)
+
+                kwargs = {
+                    "is_split_into_words": True,
+                    # "return_token_type_ids": True,  # Use the defaults for each tokenizer
+                    # "return_attention_mask": True,  # Use the defaults for each tokenizer
+                    "return_overflowing_tokens": False,
+                    "return_special_tokens_mask": True,
+                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
+                    # "add_special_tokens": False,
+                }
+                batch_kwargs = {
+                    "is_split_into_words": True,
+                    # "return_token_type_ids": True,  # Use the defaults for each tokenizer
+                    # "return_attention_mask": True,  # Use the defaults for each tokenizer
+                    "return_overflowing_tokens": False,
+                    "return_special_tokens_mask": True,
+                    "return_offsets_mapping": False,  # Not implemented in
python tokenizers + # "add_special_tokens": False, + } + # Test encode_plus for pretokenized inputs + output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs) + output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test batch_encode_plus for pretokenized inputs + input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair] + output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs) + output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test encode for pretokenized inputs pairs + output_r = tokenizer_r.encode( + pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True + ) + output_p = tokenizer_p.encode( + pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True + ) + self.assertEqual(output_p, output_r) + + # Test encode_plus for pretokenized inputs + output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) + output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test batch_encode_plus for pretokenized inputs + input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [ + pretokenized_input_simple + pretokenized_input_pair, + pretokenized_input_pair, + ] + output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs) + output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + def test_create_token_type_ids(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + input_simple = [1, 2, 3] + input_pair = [1, 2, 3] + + # Generate output + output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple) + output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair) + output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_build_inputs_with_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + # # Input string + # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False) + # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False) + + # # Generate output + # output_r = 
tokenizer_r.build_inputs_with_special_tokens(input_simple) + # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + # self.assertEqual(output_p, output_r) + + # # Generate pair output + # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + # self.assertEqual(output_p, output_r) + + # Input tokens id + input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False) + input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False) + + # Generate output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_padding(self, max_length=50): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + # Encode - Simple input + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.encode("This is a simple input", padding="longest") + input_p = tokenizer_p.encode("This is a simple input", padding=True) + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode - Pair input + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True) + input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest") + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode_plus - Simple input + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, 
pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest") + input_p = tokenizer_p.encode_plus("This is a simple input", padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Encode_plus - Pair input + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest") + input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Batch_encode_plus - Simple input + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="longest", + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + 
padding=True,
+                )
+                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
+
+                input_r = tokenizer_r.batch_encode_plus(
+                    ["This is a simple input 1", "This is a simple input 2"], padding="longest"
+                )
+                input_p = tokenizer_p.batch_encode_plus(
+                    ["This is a simple input 1", "This is a simple input 2"], padding=True
+                )
+                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
+
+                # Batch_encode_plus - Pair input
+                input_r = tokenizer_r.batch_encode_plus(
+                    [
+                        ("This is a simple input 1", "This is a simple input 2"),
+                        ("This is a simple pair 1", "This is a simple pair 2"),
+                    ],
+                    max_length=max_length,
+                    truncation=True,
+                    padding="max_length",
+                )
+                input_p = tokenizer_p.batch_encode_plus(
+                    [
+                        ("This is a simple input 1", "This is a simple input 2"),
+                        ("This is a simple pair 1", "This is a simple pair 2"),
+                    ],
+                    max_length=max_length,
+                    truncation=True,
+                    padding="max_length",
+                )
+                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
+
+                input_r = tokenizer_r.batch_encode_plus(
+                    [
+                        ("This is a simple input 1", "This is a simple input 2"),
+                        ("This is a simple pair 1", "This is a simple pair 2"),
+                    ],
+                    padding=True,
+                )
+                input_p = tokenizer_p.batch_encode_plus(
+                    [
+                        ("This is a simple input 1", "This is a simple input 2"),
+                        ("This is a simple pair 1", "This is a simple pair 2"),
+                    ],
+                    padding="longest",
+                )
+                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
+
+                # Using pad on single examples after tokenization
+                input_r = tokenizer_r.encode_plus("This is an input 1")
+                input_r = tokenizer_r.pad(input_r)
+
+                input_p = tokenizer_p.encode_plus("This is an input 1")
+                input_p = tokenizer_p.pad(input_p)
+
+                self.assert_padded_input_match(
+                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
+                )
+
+                # Using pad on single examples after tokenization
+                input_r = tokenizer_r.encode_plus("This is an input 1")
+                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
+
+                input_p = tokenizer_p.encode_plus("This is an input 1")
+                input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")
+
+                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
+
+                # Using pad after tokenization
+                input_r = tokenizer_r.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+                input_r = tokenizer_r.pad(input_r)
+
+                input_p = tokenizer_p.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+                input_p = tokenizer_p.pad(input_p)
+
+                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
+
+                # Using pad after tokenization
+                input_r = tokenizer_r.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
+
+                input_p = tokenizer_p.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+                input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")
+
+                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
+
+    def test_padding_different_model_input_name(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
+                pad_token_id = tokenizer_p.pad_token_id
+
+                input_r = tokenizer_r.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+                input_p = tokenizer_p.batch_encode_plus(
+                    ["This is an input 1", "This is a much longer input which should be padded"]
+                )
+
+                # rename encoded batch to "inputs"
+                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
+                del input_r[tokenizer_r.model_input_names[0]]
+
+                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
+                del input_p[tokenizer_p.model_input_names[0]]
+
+                # Renaming `input_ids` to `inputs`
+                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
+                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
+
+                input_r = tokenizer_r.pad(input_r, padding="longest")
+                input_p = tokenizer_p.pad(input_p, padding="longest")
+
+                max_length = len(input_p["inputs"][0])
+                self.assert_batch_padded_input_match(
+                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
+                )
+
+    def test_save_pretrained(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
+                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saves with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
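+                    # Every special-token attribute on the reloaded Python tokenizer should also exist on
+                    # the reloaded Rust one (the value comparisons are the commented-out checks above).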
self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + def test_embeded_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + return + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + sentence = "A, AllenNLP sentence." + tokens_r = tokenizer_r.encode_plus( + sentence, + add_special_tokens=True, + ) + tokens_p = tokenizer_p.encode_plus( + sentence, + add_special_tokens=True, + ) + + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + if "token_type_ids" in tokens_r: + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def test_compare_add_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True) + + for text in ["", " "]: + # tokenize() + no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True) + self.assertEqual( + len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add + ) + + # encode() + no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True) + self.assertEqual( + len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add + ) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), + len(with_special_tokens[key]) - simple_num_special_tokens_to_add, + ) + + # # batch_encode_plus + no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in zip(no_special_tokens[key], 
with_special_tokens[key]):
+                        self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
+
+    def test_compare_prepare_for_model(self):
+        if not self.test_slow_tokenizer:
+            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
+            return
+
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                string_sequence = "Asserting that both tokenizers are equal"
+                python_output = tokenizer_p.prepare_for_model(
+                    tokenizer_p.encode(string_sequence, add_special_tokens=False)
+                )
+                rust_output = tokenizer_r.prepare_for_model(
+                    tokenizer_r.encode(string_sequence, add_special_tokens=False)
+                )
+                for key in python_output:
+                    self.assertEqual(python_output[key], rust_output[key])
+
+    def test_special_tokens_initialization(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+
+                added_tokens = [AddedToken("<special>", lstrip=True)]
+
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
+                )
+                r_output = tokenizer_r.encode("Hey this is a <special> token")
+
+                special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
+
+                self.assertTrue(special_token_id in r_output)
+
+                if self.test_slow_tokenizer:
+                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
+                    )
+                    tokenizer_p = self.tokenizer_class.from_pretrained(
+                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
+                    )
+
+                    p_output = tokenizer_p.encode("Hey this is a <special> token")
+
+                    cr_output = tokenizer_cr.encode("Hey this is a <special> token")
+
+                    self.assertEqual(p_output, r_output)
+                    self.assertEqual(cr_output, r_output)
+                    self.assertTrue(special_token_id in p_output)
+                    self.assertTrue(special_token_id in cr_output)
+
+
+@is_staging_test
+class TokenizerPushToHubTester(unittest.TestCase):
+    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
+
+    @classmethod
+    def setUpClass(cls):
+        cls._api = HfApi(endpoint=ENDPOINT_STAGING)
+        cls._token = cls._api.login(username=USER, password=PASS)
+
+    @classmethod
+    def tearDownClass(cls):
+        try:
+            cls._api.delete_repo(token=cls._token, name="test-tokenizer")
+        except HTTPError:
+            pass
+
+        try:
+            cls._api.delete_repo(token=cls._token, name="test-tokenizer-org", organization="valid_org")
+        except HTTPError:
+            pass
+
+    def test_push_to_hub(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            vocab_file = os.path.join(tmp_dir, "vocab.txt")
+            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
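+            # vocab.txt now holds one WordPiece token per line, which is the format BertTokenizer expects.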
tokenizer = BertTokenizer(vocab_file) + tokenizer.save_pretrained( + tmp_dir, + push_to_hub=True, + repo_name="test-tokenizer-org", + use_auth_token=self._token, + organization="valid_org", + ) + + new_tokenizer = BertTokenizer.from_pretrained("valid_org/test-tokenizer-org") + self.assertDictEqual(new_tokenizer.vocab, tokenizer.vocab) diff --git a/test_tokenization_cpm.py b/test_tokenization_cpm.py new file mode 100644 index 0000000000000000000000000000000000000000..c65e8f07528d0e1df808d2d7cd4b73890dd2ba71 --- /dev/null +++ b/test_tokenization_cpm.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from transformers.models.cpm.tokenization_cpm import CpmTokenizer +from transformers.testing_utils import custom_tokenizers + +from .test_modeling_xlnet import XLNetModelTest + + +@custom_tokenizers +class CpmTokenizationTest(XLNetModelTest): + def test_pre_tokenization(self): + tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate") + text = "Hugging Face大法好,谁用谁知道。" + normalized_text = "Hugging Face大法好,谁用谁知道。" + bpe_tokens = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split() + + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8, 9, 440, 108, 440, 121, 90, 8, 12, 0] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + reconstructed_text = tokenizer.decode(input_bpe_tokens) + self.assertEqual(reconstructed_text, normalized_text) diff --git a/test_tokenization_ctrl.py b/test_tokenization_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..f4cd52d60117a0d4798b8a714ebb817c45fb5097 --- /dev/null +++ b/test_tokenization_ctrl.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = CTRLTokenizer + test_rust_tokenizer = False + test_seq2seq = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
+        vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
+            fp.write("\n".join(merges))
+
+    def get_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "adapt react readapt apt"
+        output_text = "adapt react readapt apt"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        text = "adapt react readapt apt"
+        bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split()
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + [tokenizer.unk_token]
+
+        input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
diff --git a/test_tokenization_deberta.py b/test_tokenization_deberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..33bf5efe1aff7473a7a6ea00e6db1ef076b2c851
--- /dev/null
+++ b/test_tokenization_deberta.py
@@ -0,0 +1,158 @@
+# coding=utf-8
+# Copyright 2019 Hugging Face inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import os
+import unittest
+
+from transformers import DebertaTokenizer, DebertaTokenizerFast
+from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
+from transformers.testing_utils import slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = DebertaTokenizer
+    test_rust_tokenizer = True
+    rust_tokenizer_class = DebertaTokenizerFast
+
+    def setUp(self):
+        super().setUp()
+
+        # Adapted from Sennrich et al.
2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "[UNK]", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "[UNK]"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + text = "lower newer" + bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/deberta-base") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_text_from_decode = tokenizer.encode( + "sequence builders", add_special_tokens=True, add_prefix_space=False + ) + encoded_pair_from_decode = tokenizer.encode( + "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False + ) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == encoded_text_from_decode + assert encoded_pair == encoded_pair_from_decode + + @slow + def test_tokenizer_integration(self): + tokenizer_classes = [self.tokenizer_class] + if self.test_rust_tokenizer: + tokenizer_classes.append(self.rust_tokenizer_class) + + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("microsoft/deberta-base") + + sequences = [ + "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations", + "ALBERT incorporates two parameter reduction techniques", + "The first one is a factorized embedding parameterization. 
By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.", + ] + + encoding = tokenizer(sequences, padding=True) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = { + 'input_ids': [ + [1, 2118, 11126, 565, 35, 83, 25191, 163, 18854, 13, 12156, 12, 16101, 25376, 13807, 9, 22205, 27893, 1635, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 2118, 11126, 565, 24536, 80, 43797, 4878, 7373, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 133, 78, 65, 16, 10, 3724, 1538, 33183, 11303, 43797, 1938, 4, 870, 24165, 29105, 5, 739, 32644, 33183, 11303, 36173, 88, 80, 650, 7821, 45940, 6, 52, 2559, 5, 1836, 9, 5, 7397, 13171, 31, 5, 1836, 9, 32644, 33183, 11303, 4, 2] + ], + 'token_type_ids': [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ], + 'attention_mask': [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ] + } + # fmt: on + + expected_decoded_sequence = [ + "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations", + "ALBERT incorporates two parameter reduction techniques", + "The first one is a factorized embedding parameterization. By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.", + ] + + self.assertDictEqual(encoding.data, expected_encoding) + + for expected, decoded in zip(expected_decoded_sequence, decoded_sequences): + self.assertEqual(expected, decoded) diff --git a/test_tokenization_deberta_v2.py b/test_tokenization_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..5f79903a3b56e95cf2a77e72537eaef42d321c68 --- /dev/null +++ b/test_tokenization_deberta_v2.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2019 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
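+# Note: these tests run DebertaV2Tokenizer against the local SentencePiece fixture below
+# (fixtures/spiece.model) rather than a hub checkpoint, except for the @slow integration test.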
+
+import os
+import unittest
+
+from transformers import DebertaV2Tokenizer
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
+
+
+@require_sentencepiece
+@require_tokenizers
+class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = DebertaV2Tokenizer
+    rust_tokenizer_class = None
+    test_rust_tokenizer = False
+    test_sentencepiece = True
+    test_sentencepiece_ignore_case = True
+
+    def setUp(self):
+        super().setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "this is a test"
+        output_text = "this is a test"
+        return input_text, output_text
+
+    def test_convert_token_and_id(self):
+        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        token = "<pad>"
+        token_id = 0
+
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+
+        self.assertEqual(vocab_keys[0], "<pad>")
+        self.assertEqual(vocab_keys[1], "<unk>")
+        self.assertEqual(vocab_keys[-1], "[PAD]")
+        self.assertEqual(len(vocab_keys), 30_001)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
+
+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            return
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence = "I was born in 92000, and this is falsé."
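+        # The digits in "92000" and the accented "falsé" exercise number splitting and unicode
+        # handling, which should behave identically in the slow and fast implementations.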
+
+        tokens = tokenizer.tokenize(sequence)
+        rust_tokens = rust_tokenizer.tokenize(sequence)
+        self.assertListEqual(tokens, rust_tokens)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        rust_tokenizer = self.get_rust_tokenizer()
+        ids = tokenizer.encode(sequence)
+        rust_ids = rust_tokenizer.encode(sequence)
+        self.assertListEqual(ids, rust_ids)
+
+    def test_full_tokenizer(self):
+        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, keep_accents=True)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁", "[UNK]", "his", "▁is", "▁a", "▁test"])
+
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        # fmt: off
+        self.assertListEqual(
+            tokens,
+            ["▁", "[UNK]", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "[UNK]", "."],
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(
+            back_tokens,
+            ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "."],
+        )
+        # fmt: on
+
+    def test_sequence_builders(self):
+        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        self.assertEqual([tokenizer.cls_token_id] + text + [tokenizer.sep_token_id], encoded_sentence)
+        self.assertEqual(
+            [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id],
+            encoded_pair,
+        )
+
+    @slow
+    def test_tokenizer_integration(self):
+        # fmt: off
+        expected_encoding = {'input_ids': [[1, 39867, 36, 19390, 486, 27, 35052, 81436, 18, 60685, 1225, 7, 35052, 81436, 18, 9367, 16899, 18, 15937, 53, 594, 773, 18, 16287, 30465, 36, 15937, 6, 41139, 38, 36979, 60763, 191, 6, 34132, 99, 6, 50538, 390, 43230, 6, 34132, 2779, 20850, 14, 699, 1072, 1194, 36, 382, 10901, 53, 7, 699, 1072, 2084, 36, 20422, 630, 53, 19, 105, 3049, 1896, 1053, 16899, 1506, 11, 37978, 4243, 7, 1237, 31869, 200, 16566, 654, 6, 35052, 81436, 7, 55630, 13593, 4, 2], [1, 26, 15011, 13, 667, 8, 1053, 18, 23611, 1237, 72356, 12820, 34, 104134, 1209, 35, 13313, 6627, 21, 202, 347, 7, 164, 2399, 11, 46, 4485, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 1232, 2864, 15785, 14951, 105, 5, 8581, 1250, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="microsoft/deberta-v2-xlarge", + revision="ad6e42c1532ddf3a15c39246b63f5559d558b670", + ) diff --git a/test_tokenization_distilbert.py b/test_tokenization_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..3fb380156055e9002c5926151a58ec08f3df8bcd --- /dev/null +++ b/test_tokenization_distilbert.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
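+# DistilBERT uses the same WordPiece vocabulary as BERT, so its tokenizer tests below simply
+# extend the BERT suite and only add a model-specific sequence-builders check.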
+ + +from transformers import DistilBertTokenizer, DistilBertTokenizerFast +from transformers.testing_utils import require_tokenizers, slow + +from .test_tokenization_bert import BertTokenizationTest + + +@require_tokenizers +class DistilBertTokenizationTest(BertTokenizationTest): + + tokenizer_class = DistilBertTokenizer + rust_tokenizer_class = DistilBertTokenizerFast + test_rust_tokenizer = True + + @slow + def test_sequence_builders(self): + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] diff --git a/test_tokenization_dpr.py b/test_tokenization_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..bc5ccb319e78b61c5973666cb3bf2c3fdc041998 --- /dev/null +++ b/test_tokenization_dpr.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
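+# The DPR tokenizers are thin wrappers around the BERT tokenizer, so these suites inherit from
+# BertTokenizationTest; the reader suite additionally covers span decoding and __call__.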
+ + +from transformers import ( + DPRContextEncoderTokenizer, + DPRContextEncoderTokenizerFast, + DPRQuestionEncoderTokenizer, + DPRQuestionEncoderTokenizerFast, + DPRReaderOutput, + DPRReaderTokenizer, + DPRReaderTokenizerFast, +) +from transformers.testing_utils import require_tokenizers, slow +from transformers.tokenization_utils_base import BatchEncoding + +from .test_tokenization_bert import BertTokenizationTest + + +@require_tokenizers +class DPRContextEncoderTokenizationTest(BertTokenizationTest): + + tokenizer_class = DPRContextEncoderTokenizer + rust_tokenizer_class = DPRContextEncoderTokenizerFast + test_rust_tokenizer = True + + +@require_tokenizers +class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): + + tokenizer_class = DPRQuestionEncoderTokenizer + rust_tokenizer_class = DPRQuestionEncoderTokenizerFast + test_rust_tokenizer = True + + +@require_tokenizers +class DPRReaderTokenizationTest(BertTokenizationTest): + + tokenizer_class = DPRReaderTokenizer + rust_tokenizer_class = DPRReaderTokenizerFast + test_rust_tokenizer = True + + @slow + def test_decode_best_spans(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text_1 = tokenizer.encode("question sequence", add_special_tokens=False) + text_2 = tokenizer.encode("title sequence", add_special_tokens=False) + text_3 = tokenizer.encode("text sequence " * 4, add_special_tokens=False) + input_ids = [[101] + text_1 + [102] + text_2 + [102] + text_3] + reader_input = BatchEncoding({"input_ids": input_ids}) + + start_logits = [[0] * len(input_ids[0])] + end_logits = [[0] * len(input_ids[0])] + relevance_logits = [0] + reader_output = DPRReaderOutput(start_logits, end_logits, relevance_logits) + + start_index, end_index = 8, 9 + start_logits[0][start_index] = 10 + end_logits[0][end_index] = 10 + predicted_spans = tokenizer.decode_best_spans(reader_input, reader_output) + self.assertEqual(predicted_spans[0].start_index, start_index) + self.assertEqual(predicted_spans[0].end_index, end_index) + self.assertEqual(predicted_spans[0].doc_id, 0) + + @slow + def test_call(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text_1 = tokenizer.encode("question sequence", add_special_tokens=False) + text_2 = tokenizer.encode("title sequence", add_special_tokens=False) + text_3 = tokenizer.encode("text sequence", add_special_tokens=False) + expected_input_ids = [101] + text_1 + [102] + text_2 + [102] + text_3 + encoded_input = tokenizer(questions=["question sequence"], titles=["title sequence"], texts=["text sequence"]) + self.assertIn("input_ids", encoded_input) + self.assertIn("attention_mask", encoded_input) + self.assertListEqual(encoded_input["input_ids"][0], expected_input_ids) diff --git a/test_tokenization_fast.py b/test_tokenization_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..796a3f07c21e032cbe360485980639329d58218e --- /dev/null +++ b/test_tokenization_fast.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import PreTrainedTokenizerFast +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase): + rust_tokenizer_class = PreTrainedTokenizerFast + test_slow_tokenizer = False + test_rust_tokenizer = True + from_pretrained_vocab_key = "tokenizer_file" + + def setUp(self): + self.test_rust_tokenizer = False # because we don't have pretrained_vocab_files_map + super().setUp() + self.test_rust_tokenizer = True + + self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})] + + tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast") + tokenizer.save_pretrained(self.tmpdirname) + + def test_pretrained_model_lists(self): + # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any + # model + pass + + def test_prepare_for_model(self): + # We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any + # model + pass + + def test_rust_tokenizer_signature(self): + # PreTrainedTokenizerFast doesn't have tokenizer_file in its signature + pass diff --git a/test_tokenization_fsmt.py b/test_tokenization_fsmt.py new file mode 100644 index 0000000000000000000000000000000000000000..05c80ee3dfa5e75a97d86b65614bb06452283b26 --- /dev/null +++ b/test_tokenization_fsmt.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers.file_utils import cached_property +from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer +from transformers.testing_utils import slow + +from .test_tokenization_common import TokenizerTesterMixin + + +# using a different tiny model than the one used for default params defined in init to ensure proper testing +FSMT_TINY2 = "stas/tiny-wmt19-en-ru" + + +class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = FSMTTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + # tokens that end a word carry the "</w>" end-of-word marker + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w</w>", + "r</w>", + "t</w>", + "lo", + "low", + "er</w>", + "low</w>", + "lowest</w>", + "newer</w>", + "wider</w>", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] + + self.langs = ["en", "ru"] + config = { + "langs": self.langs, + "src_vocab_size": 10, + "tgt_vocab_size": 20, + } + + self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"]) + self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"]) + config_file = os.path.join(self.tmpdirname, "tokenizer_config.json") + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.src_vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.tgt_vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + with open(config_file, "w") as fp: + fp.write(json.dumps(config)) + + @cached_property + def tokenizer_ru_en(self): + return FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en") + + @cached_property + def tokenizer_en_ru(self): + return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru") + + def test_online_tokenizer_config(self): + """this just tests that the online tokenizer files get correctly fetched and + loaded via its tokenizer_config.json, and it's not slow, so it's run by normal CI + """ + tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2) + self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"]) + self.assertEqual(tokenizer.src_vocab_size, 21) + self.assertEqual(tokenizer.tgt_vocab_size, 21) + + def test_full_tokenizer(self): + """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt""" + tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er</w>"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + ["<unk>"] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_ru_en + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == text + [2] + assert encoded_pair == text + [2] + text_2 + [2] + + @slow + def test_match_encode_decode(self): + tokenizer_enc = self.tokenizer_en_ru + tokenizer_dec = self.tokenizer_ru_en + + targets = [ + [ + "Here's a little song I wrote. Don't worry, be happy.", + [2470, 39, 11, 2349, 7222, 70, 5979, 7, 8450, 1050, 13160, 5, 26, 6445, 7, 2], + ], + ["This is it. No more. I'm done!", [132, 21, 37, 7, 1434, 86, 7, 70, 6476, 1305, 427, 2]], + ] + + # if data needs to be recreated or added, run: + # import torch + # model = torch.hub.load("pytorch/fairseq", "transformer.wmt19.en-ru", checkpoint_file="model4.pt", tokenizer="moses", bpe="fastbpe") + # for src_text, _ in targets: print(f"""[\n"{src_text}",\n {model.encode(src_text).tolist()}\n],""") + + for src_text, tgt_input_ids in targets: + encoded_ids = tokenizer_enc.encode(src_text, return_tensors=None) + self.assertListEqual(encoded_ids, tgt_input_ids) + + # and decode backward, using the reversed languages model + decoded_text = tokenizer_dec.decode(encoded_ids, skip_special_tokens=True) + self.assertEqual(decoded_text, src_text) + + @slow + def test_tokenizer_lower(self): + tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True) + tokens = tokenizer.tokenize("USA is United States of America") + expected = ["us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>"] + self.assertListEqual(tokens, expected) + + @unittest.skip("FSMTConfig.__init__ requires non-optional args") + def test_torch_encode_plus_sent_to_model(self): + pass + + @unittest.skip("FSMTConfig.__init__ requires non-optional args") + def test_np_encode_plus_sent_to_model(self): + pass diff --git a/test_tokenization_funnel.py b/test_tokenization_funnel.py new file mode 100644 index 0000000000000000000000000000000000000000..0cb76a7ef07c08e087bf0ece1c1fe81a88596d9d --- /dev/null +++ b/test_tokenization_funnel.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import os +import unittest + +from transformers import FunnelTokenizer, FunnelTokenizerFast +from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = FunnelTokenizer + rust_tokenizer_class = FunnelTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "<unk>", + "<cls>", + "<sep>", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + inputs = tokenizer("UNwant\u00E9d,running") + sentence_len = len(inputs["input_ids"]) - 1 + self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len) + + inputs = tokenizer("UNwant\u00E9d,running", "UNwant\u00E9d,running") + self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len + [1] * sentence_len) diff --git a/test_tokenization_gpt2.py b/test_tokenization_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..8d70d8814ec39702ed84b8bb14e68e280aa5e1c8 --- /dev/null +++ b/test_tokenization_gpt2.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import json +import os +import unittest + +from transformers import GPT2Tokenizer, GPT2TokenizerFast +from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = GPT2Tokenizer + rust_tokenizer_class = GPT2TokenizerFast + test_rust_tokenizer = True + from_pretrained_kwargs = {"add_prefix_space": True} + test_seq2seq = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + "<|endoftext|>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text, add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + + sequence = "lower newer" + + # Testing tokenization + tokens = tokenizer.tokenize(sequence, add_prefix_space=True) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + # Testing conversion to ids without special tokens + ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # Testing conversion to ids with special tokens + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + ids = tokenizer.encode(sequence, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Testing the unknown token + input_tokens = tokens + [rust_tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] +
self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_pretokenized_inputs(self, *args, **kwargs): + # It's very difficult to mix/test pretokenization with byte-level + # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string) + pass + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + # tokenizer has no padding token + def test_padding_different_model_input_name(self): + pass diff --git a/test_tokenization_herbert.py b/test_tokenization_herbert.py new file mode 100644 index 0000000000000000000000000000000000000000..e8569406bf9f4859b020307eb96ec054ed2c0bbb --- /dev/null +++ b/test_tokenization_herbert.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Allegro.pl and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import unittest + +from transformers import HerbertTokenizer, HerbertTokenizerFast +from transformers.models.herbert.tokenization_herbert import VOCAB_FILES_NAMES +from transformers.testing_utils import get_tests_dir, require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = HerbertTokenizer + rust_tokenizer_class = HerbertTokenizerFast + test_rust_tokenizer = True + + def setUp(self): + super().setUp() + + # Use a simpler test file without japanese/chinese characters + with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data: + self._data = f_data.read().replace("\n\n", "\n").strip() + + vocab = [ + "<s>", + "</s>", + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w</w>", + "r</w>", + "t</w>", + "lo", + "low", + "er</w>", + "low</w>", + "lowest</w>", + "newer</w>", + "wider</w>", + ",</w>", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(vocab_file=self.vocab_file, merges_file=self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er</w>"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + ["<unk>"] + input_bpe_tokens = [16, 17, 23] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "lower,newer" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased") + + text = tokenizer.encode("konstruowanie sekwencji", add_special_tokens=False) + text_2 = tokenizer.encode("konstruowanie wielu sekwencji", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [0] + text + [2] + assert encoded_pair == [0] + text + [2] + text_2 + [2] diff --git a/test_tokenization_layoutlm.py b/test_tokenization_layoutlm.py new file mode 100644 index 0000000000000000000000000000000000000000..79831cd30c4d95101bf3dc70991276d9cda357df --- /dev/null +++ b/test_tokenization_layoutlm.py @@ -0,0 +1,74 @@ +# coding=utf-8 +# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The
Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast +from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = LayoutLMTokenizer + rust_tokenizer_class = LayoutLMTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_special_tokens_as_you_expect(self): + """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids""" + pass diff --git a/test_tokenization_luke.py b/test_tokenization_luke.py new file mode 100644 index 0000000000000000000000000000000000000000..84bf52a0f3b3d82938c799c838a785bde5b035fa --- /dev/null +++ b/test_tokenization_luke.py @@ -0,0 +1,576 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import AddedToken, LukeTokenizer +from transformers.testing_utils import require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +class Luke(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = LukeTokenizer + test_rust_tokenizer = False + from_pretrained_kwargs = {"cls_token": "<s>"} + + def setUp(self): + super().setUp() + + # <ent> and <ent2> are LUKE's special entity marker tokens + self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"} + + def get_tokenizer(self, task=None, **kwargs): + kwargs.update(self.special_tokens_map) + return self.tokenizer_class.from_pretrained("studio-ousia/luke-base", task=task, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/luke-base") + text = "lower newer" + bpe_tokens = ["lower", "\u0120newer"] + tokens = tokenizer.tokenize(text) # , add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [29668, 13964, 3] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def luke_dict_integration_testing(self): + tokenizer = self.get_tokenizer() + + self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2]) + self.assertListEqual( + tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], + ) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/luke-large") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_text_from_decode = tokenizer.encode( + "sequence builders", add_special_tokens=True, add_prefix_space=False + ) + encoded_pair_from_decode = tokenizer.encode( + "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False + ) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == encoded_text_from_decode + assert encoded_pair == encoded_pair_from_decode + + def test_space_encoding(self): + tokenizer = self.get_tokenizer() + + sequence = "Encode this sequence."
+ space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]] + + # Testing encoder arguments + encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False) + first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0] + self.assertNotEqual(first_char, space_encoding) + + encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) + first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0] + self.assertEqual(first_char, space_encoding) + + tokenizer.add_special_tokens({"bos_token": "<s>"}) + encoded = tokenizer.encode(sequence, add_special_tokens=True) + first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0] + self.assertNotEqual(first_char, space_encoding) + + # Testing spaces after special tokens + mask = "<mask>" + tokenizer.add_special_tokens( + {"mask_token": AddedToken(mask, lstrip=True, rstrip=False)} + ) # mask token has a left space + mask_ind = tokenizer.convert_tokens_to_ids(mask) + + sequence = "Encode <mask> sequence" + sequence_nospace = "Encode <mask>sequence" + + encoded = tokenizer.encode(sequence) + mask_loc = encoded.index(mask_ind) + first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] + self.assertEqual(first_char, space_encoding) + + encoded = tokenizer.encode(sequence_nospace) + mask_loc = encoded.index(mask_ind) + first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] + self.assertNotEqual(first_char, space_encoding) + + def test_pretokenized_inputs(self): + pass + + def test_embeded_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + sentence = "A, <mask> AllenNLP sentence." + tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + + # token_type_ids should put 0 everywhere + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + # attention_mask should put 1 everywhere, so sum over length should be 1 + self.assertEqual( + sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), + sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), + ) + + tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + + # Rust correctly handles the space before the mask while python doesn't + self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + + self.assertSequenceEqual( + tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] + ) + + +@require_torch +class LukeTokenizerIntegrationTests(unittest.TestCase): + tokenizer_class = LukeTokenizer + from_pretrained_kwargs = {"cls_token": "<s>"} + + def setUp(self): + super().setUp() + + def test_single_text_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
+ entities = ["Ana Ivanovic", "Thursday", "Dummy Entity"] + spans = [(9, 21), (30, 38), (39, 42)] + + encoding = tokenizer(sentence, entities=entities, entity_spans=spans, return_token_type_ids=True) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday she could hardly believe her luck.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][3:6], spaces_between_special_tokens=False), " Ana Ivanovic" + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][8:9], spaces_between_special_tokens=False), " Thursday" + ) + self.assertEqual(tokenizer.decode(encoding["input_ids"][9:10], spaces_between_special_tokens=False), " she") + + self.assertEqual( + encoding["entity_ids"], + [ + tokenizer.entity_vocab["Ana Ivanovic"], + tokenizer.entity_vocab["Thursday"], + tokenizer.entity_vocab["[UNK]"], + ], + ) + self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + + def test_single_text_only_entity_spans_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck." + spans = [(9, 21), (30, 38), (39, 42)] + + encoding = tokenizer(sentence, entity_spans=spans, return_token_type_ids=True) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday she could hardly believe her luck.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][3:6], spaces_between_special_tokens=False), " Ana Ivanovic" + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][8:9], spaces_between_special_tokens=False), " Thursday" + ) + self.assertEqual(tokenizer.decode(encoding["input_ids"][9:10], spaces_between_special_tokens=False), " she") + + mask_id = tokenizer.entity_vocab["[MASK]"] + self.assertEqual(encoding["entity_ids"], [mask_id, mask_id, mask_id]) + self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + + def test_single_text_padding_pytorch_tensors(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
+ entities = ["Ana Ivanovic", "Thursday", "Dummy Entity"] + spans = [(9, 21), (30, 38), (39, 42)] + + encoding = tokenizer( + sentence, + entities=entities, + entity_spans=spans, + return_token_type_ids=True, + padding="max_length", + max_length=30, + max_entity_length=16, + return_tensors="pt", + ) + + # test words + self.assertEqual(encoding["input_ids"].shape, (1, 30)) + self.assertEqual(encoding["attention_mask"].shape, (1, 30)) + self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) + + # test entities + self.assertEqual(encoding["entity_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) + self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) + + def test_text_pair_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday" + sentence_pair = "She could hardly believe her luck." + entities = ["Ana Ivanovic", "Thursday"] + entities_pair = ["Dummy Entity"] + spans = [(9, 21), (30, 38)] + spans_pair = [(0, 3)] + + encoding = tokenizer( + sentence, + sentence_pair, + entities=entities, + entities_pair=entities_pair, + entity_spans=spans, + entity_spans_pair=spans_pair, + return_token_type_ids=True, + ) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday</s></s>She could hardly believe her luck.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][3:6], spaces_between_special_tokens=False), " Ana Ivanovic" + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][8:9], spaces_between_special_tokens=False), " Thursday" + ) + self.assertEqual(tokenizer.decode(encoding["input_ids"][11:12], spaces_between_special_tokens=False), "She") + + self.assertEqual( + encoding["entity_ids"], + [ + tokenizer.entity_vocab["Ana Ivanovic"], + tokenizer.entity_vocab["Thursday"], + tokenizer.entity_vocab["[UNK]"], + ], + ) + self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + + def test_text_pair_only_entity_spans_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday" + sentence_pair = "She could hardly believe her luck."
+ spans = [(9, 21), (30, 38)] + spans_pair = [(0, 3)] + + encoding = tokenizer( + sentence, + sentence_pair, + entity_spans=spans, + entity_spans_pair=spans_pair, + return_token_type_ids=True, + ) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday</s></s>She could hardly believe her luck.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][3:6], spaces_between_special_tokens=False), " Ana Ivanovic" + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][8:9], spaces_between_special_tokens=False), " Thursday" + ) + self.assertEqual(tokenizer.decode(encoding["input_ids"][11:12], spaces_between_special_tokens=False), "She") + + mask_id = tokenizer.entity_vocab["[MASK]"] + self.assertEqual(encoding["entity_ids"], [mask_id, mask_id, mask_id]) + self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + + def test_text_pair_padding_pytorch_tensors(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", return_token_type_ids=True) + sentence = "Top seed Ana Ivanovic said on Thursday" + sentence_pair = "She could hardly believe her luck." + entities = ["Ana Ivanovic", "Thursday"] + entities_pair = ["Dummy Entity"] + spans = [(9, 21), (30, 38)] + spans_pair = [(0, 3)] + + encoding = tokenizer( + sentence, + sentence_pair, + entities=entities, + entities_pair=entities_pair, + entity_spans=spans, + entity_spans_pair=spans_pair, + return_token_type_ids=True, + padding="max_length", + max_length=30, + max_entity_length=16, + return_tensors="pt", + ) + + # test words + self.assertEqual(encoding["input_ids"].shape, (1, 30)) + self.assertEqual(encoding["attention_mask"].shape, (1, 30)) + self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) + + # test entities + self.assertEqual(encoding["entity_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) + self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) + + def test_entity_classification_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification") + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
+ span = (39, 42) + + encoding = tokenizer(sentence, entity_spans=[span], return_token_type_ids=True) + + # test words + self.assertEqual(len(encoding["input_ids"]), 42) + self.assertEqual(len(encoding["attention_mask"]), 42) + self.assertEqual(len(encoding["token_type_ids"]), 42) + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday<ent> she<ent> could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][9:12], spaces_between_special_tokens=False), "<ent> she<ent>" + ) + + # test entities + self.assertEqual(encoding["entity_ids"], [2]) + self.assertEqual(encoding["entity_attention_mask"], [1]) + self.assertEqual(encoding["entity_token_type_ids"], [0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] + ] + ) + # fmt: on + + def test_entity_classification_padding_pytorch_tensors(self): + tokenizer = LukeTokenizer.from_pretrained( + "studio-ousia/luke-base", task="entity_classification", return_token_type_ids=True + ) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ." + # entity information + span = (39, 42) + + encoding = tokenizer( + sentence, entity_spans=[span], return_token_type_ids=True, padding="max_length", return_tensors="pt" + ) + + # test words + self.assertEqual(encoding["input_ids"].shape, (1, 512)) + self.assertEqual(encoding["attention_mask"].shape, (1, 512)) + self.assertEqual(encoding["token_type_ids"].shape, (1, 512)) + + # test entities + self.assertEqual(encoding["entity_ids"].shape, (1, 1)) + self.assertEqual(encoding["entity_attention_mask"].shape, (1, 1)) + self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 1)) + self.assertEqual( + encoding["entity_position_ids"].shape, (1, tokenizer.max_entity_length, tokenizer.max_mention_length) + ) + + def test_entity_pair_classification_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained( + "studio-ousia/luke-base", task="entity_pair_classification", return_token_type_ids=True + ) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
+ # head and tail information + spans = [(9, 21), (39, 42)] + + encoding = tokenizer(sentence, entity_spans=spans, return_token_type_ids=True) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed<ent> Ana Ivanovic<ent> said on Thursday<ent2> she<ent2> could hardly believe her luck.</s>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][3:8], spaces_between_special_tokens=False), + "<ent> Ana Ivanovic<ent>", + ) + self.assertEqual( + tokenizer.decode(encoding["input_ids"][11:14], spaces_between_special_tokens=False), "<ent2> she<ent2>" + ) + + self.assertEqual(encoding["entity_ids"], [2, 3]) + self.assertEqual(encoding["entity_attention_mask"], [1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + + def test_entity_pair_classification_padding_pytorch_tensors(self): + tokenizer = LukeTokenizer.from_pretrained( + "studio-ousia/luke-base", task="entity_pair_classification", return_token_type_ids=True + ) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck." + # head and tail information + spans = [(9, 21), (39, 42)] + + encoding = tokenizer( + sentence, + entity_spans=spans, + return_token_type_ids=True, + padding="max_length", + max_length=30, + return_tensors="pt", + ) + + # test words + self.assertEqual(encoding["input_ids"].shape, (1, 30)) + self.assertEqual(encoding["attention_mask"].shape, (1, 30)) + self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) + + # test entities + self.assertEqual(encoding["entity_ids"].shape, (1, 2)) + self.assertEqual(encoding["entity_attention_mask"].shape, (1, 2)) + self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 2)) + self.assertEqual( + encoding["entity_position_ids"].shape, (1, tokenizer.max_entity_length, tokenizer.max_mention_length) + ) + + def test_entity_span_classification_no_padding_or_truncation(self): + tokenizer = LukeTokenizer.from_pretrained( + "studio-ousia/luke-base", task="entity_span_classification", return_token_type_ids=True + ) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
+ spans = [(0, 8), (9, 21), (39, 42)] + + encoding = tokenizer(sentence, entity_spans=spans, return_token_type_ids=True) + + self.assertEqual( + tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), + "<s>Top seed Ana Ivanovic said on Thursday she could hardly believe her luck.</s>", + ) + + self.assertEqual(encoding["entity_ids"], [2, 2, 2]) + self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) + self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) + # fmt: off + self.assertEqual( + encoding["entity_position_ids"], + [ + [1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ) + # fmt: on + self.assertEqual(encoding["entity_start_positions"], [1, 3, 9]) + self.assertEqual(encoding["entity_end_positions"], [2, 5, 9]) + + def test_entity_span_classification_padding_pytorch_tensors(self): + tokenizer = LukeTokenizer.from_pretrained( + "studio-ousia/luke-base", task="entity_span_classification", return_token_type_ids=True + ) + sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck." + spans = [(0, 8), (9, 21), (39, 42)] + + encoding = tokenizer( + sentence, + entity_spans=spans, + return_token_type_ids=True, + padding="max_length", + max_length=30, + max_entity_length=16, + return_tensors="pt", + ) + + # test words + self.assertEqual(encoding["input_ids"].shape, (1, 30)) + self.assertEqual(encoding["attention_mask"].shape, (1, 30)) + self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) + + # test entities + self.assertEqual(encoding["entity_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) + self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) + self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) + self.assertEqual(encoding["entity_start_positions"].shape, (1, 16)) + self.assertEqual(encoding["entity_end_positions"].shape, (1, 16)) diff --git a/test_tokenization_lxmert.py b/test_tokenization_lxmert.py new file mode 100644 index 0000000000000000000000000000000000000000..a19ea8095dafa153e48769857b71dcb0f8b83487 --- /dev/null +++ b/test_tokenization_lxmert.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2018 LXMERT Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import os +import unittest + +from transformers import LxmertTokenizer, LxmertTokenizerFast +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = LxmertTokenizer + rust_tokenizer_class = LxmertTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé." + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) diff --git a/test_tokenization_m2m_100.py b/test_tokenization_m2m_100.py new file mode 100644 index 0000000000000000000000000000000000000000..1466a45e8634d75bb2ea2e0ad47112f53b514506 --- /dev/null +++ b/test_tokenization_m2m_100.py @@ -0,0 +1,240 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest +from pathlib import Path +from shutil import copyfile + +from transformers import M2M100Tokenizer, is_torch_available +from transformers.file_utils import is_sentencepiece_available +from transformers.testing_utils import nested_simplify, require_sentencepiece, require_tokenizers, require_torch, slow + + +if is_sentencepiece_available(): + from transformers.models.m2m_100.tokenization_m2m_100 import save_json, VOCAB_FILES_NAMES + +from .test_tokenization_common import TokenizerTesterMixin + + +if is_sentencepiece_available(): + SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +if is_torch_available(): + from transformers.models.m2m_100.modeling_m2m_100 import shift_tokens_right + +EN_CODE = 128022 +FR_CODE = 128028 + + +@require_sentencepiece +class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = M2M100Tokenizer + test_rust_tokenizer = False + test_seq2seq = False + test_sentencepiece = True + + def setUp(self): + super().setUp() + + vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + save_dir = Path(self.tmpdirname) + save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) + if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) + + tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + return ( + "This is a test", + "This is a test", + ) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "</s>" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "</s>") + self.assertEqual(vocab_keys[1], "<unk>") + self.assertEqual(vocab_keys[-1], "<s>") + self.assertEqual(len(vocab_keys), 10) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 117) + + @unittest.skip("Skip this test while all models are still to be uploaded.") + def test_pretrained_model_lists(self): + pass + + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [2, 3, 4, 5, 6], + ) + + back_tokens = tokenizer.convert_ids_to_tokens([2, 3, 4, 5, 6]) + self.assertListEqual(back_tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + text = tokenizer.convert_tokens_to_string(tokens) + self.assertEqual(text, "This is a test") + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[128022, 110108, 397, 11, 38272, 2247, 124811, 285, 18105, 1586, 207, 7, 39534, 4428, 397, 1019, 18105, 1586, 207, 7, 41337, 16786, 241, 7, 20214, 17, 125690, 10398, 7, 44378, 58069, 68342, 7798, 7343, 11, 299, 33310, 4, 158, 37350, 94077, 4569, 299, 33310, 90, 4, 52840, 290, 4, 31270, 112, 299, 682, 4, 52840, 39953, 14079, 193, 52519, 90894, 17894, 120697, 11, 40445, 551, 17, 1019, 52519, 90894, 17756, 963,
11, 40445, 480, 17, 9792, 1120, 5173, 1393, 6240, 16786, 241, 120996, 28, 1245, 1393, 118240, 11123, 1019, 93612, 2691, 10618, 98058, 120409, 1928, 279, 4, 40683, 367, 178, 207, 1019, 103, 103121, 506, 65296, 5, 2], [128022, 21217, 367, 117, 125450, 128, 719, 7, 7308, 40, 93612, 12669, 1116, 16704, 71, 17785, 3699, 15592, 35, 144, 9584, 241, 11943, 713, 950, 799, 2247, 88427, 150, 149, 118813, 120706, 1019, 106906, 81518, 28, 1224, 22799, 397, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [128022, 1658, 123311, 5155, 5578, 4722, 279, 14947, 2366, 1120, 1197, 14, 1348, 9232, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="facebook/m2m100_418M", + revision="c168bae485c864188cf9aa0e4108b0b6934dc91e", + ) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class M2M100TokenizerIntegrationTest(unittest.TestCase): + checkpoint_name = "facebook/m2m100_418M" + src_text = [ + "In my opinion, there are two levels of response from the French government.", + "NSA Affair Emphasizes Complete Lack of Debate on Intelligence", + ] + tgt_text = [ + "Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.", + "L'affaire NSA souligne l'absence totale de débat sur le renseignement", + ] + + # fmt: off + expected_src_tokens = [EN_CODE, 593, 1949, 115781, 4, 71586, 4234, 60633, 126233, 432, 123808, 15592, 1197, 117132, 120618, 5, 2] + # fmt: on + + @classmethod + def setUpClass(cls): + cls.tokenizer: M2M100Tokenizer = M2M100Tokenizer.from_pretrained( + cls.checkpoint_name, src_lang="en", tgt_lang="fr" + ) + cls.pad_token_id = 1 + return cls + + def check_language_codes(self): + self.assertEqual(self.tokenizer.get_lang_id("ar"), 128006) + self.assertEqual(self.tokenizer.get_lang_id("en"), 128022) + self.assertEqual(self.tokenizer.get_lang_id("ro"), 128076) + self.assertEqual(self.tokenizer.get_lang_id("mr"), 128063) + + def test_tokenizer_batch_encode_plus(self): + self.tokenizer.src_lang = "en" + ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] + 
self.assertListEqual(self.expected_src_tokens, ids) + + def test_tokenizer_decode_ignores_language_codes(self): + self.assertIn(FR_CODE, self.tokenizer.all_special_ids) + # fmt: off + generated_ids = [FR_CODE, 5364, 82, 8642, 4, 294, 47, 8, 14028, 136, 3286, 9706, 6, 90797, 6, 144012, 162, 88128, 30061, 5, 2] + # fmt: on + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_french = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_french) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_special_tokens_unaffected_by_save_load(self): + tmpdirname = tempfile.mkdtemp() + original_special_tokens = self.tokenizer.lang_token_to_id + self.tokenizer.save_pretrained(tmpdirname) + new_tok = M2M100Tokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens) + + @require_torch + def test_batch_fairseq_parity(self): + self.tokenizer.src_lang = "en" + self.tokenizer.tgt_lang = "fr" + + batch = self.tokenizer(self.src_text, padding=True, return_tensors="pt") + with self.tokenizer.as_target_tokenizer(): + batch["labels"] = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt").input_ids + + batch["decoder_input_ids"] = shift_tokens_right( + batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id + ) + + for k in batch: + batch[k] = batch[k].tolist() + # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 + # decoder_input_ids are shifted right, so they start with eos (2) followed by the target language code + assert batch.input_ids[1][0] == EN_CODE + assert batch.input_ids[1][-1] == 2 + assert batch.labels[1][0] == FR_CODE + assert batch.labels[1][-1] == 2 + assert batch.decoder_input_ids[1][:2] == [2, FR_CODE] + + @require_torch + def test_src_lang_setter(self): + self.tokenizer.src_lang = "mr" + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + + self.tokenizer.src_lang = "zh" + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + + @require_torch + def test_as_target_tokenizer(self): + self.tokenizer.tgt_lang = "mr" + with self.tokenizer.as_target_tokenizer(): + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) + + self.tokenizer.tgt_lang = "zh" + with self.tokenizer.as_target_tokenizer(): + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) + self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) + + @require_torch + def test_tokenizer_translation(self): + inputs = self.tokenizer._build_translation_inputs("A test", src_lang="en", tgt_lang="ar") + + self.assertEqual( + nested_simplify(inputs), + { + # __en__, A, test, EOS + "input_ids": [[128022, 58, 4183, 2]], + "attention_mask": [[1, 1, 1, 1]], + # __ar__ + "forced_bos_token_id": 128006, + }, + ) diff --git a/test_tokenization_marian.py b/test_tokenization_marian.py new file mode 100644 index
0000000000000000000000000000000000000000..557b7675b61af29a317b252936aab9f50b946e6a --- /dev/null +++ b/test_tokenization_marian.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest +from pathlib import Path +from shutil import copyfile + +from transformers import BatchEncoding, MarianTokenizer +from transformers.file_utils import is_sentencepiece_available, is_tf_available, is_torch_available +from transformers.testing_utils import require_sentencepiece, slow + + +if is_sentencepiece_available(): + from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + +mock_tokenizer_config = {"target_lang": "fi", "source_lang": "en"} +zh_code = ">>zh<<" +ORG_NAME = "Helsinki-NLP/" + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +@require_sentencepiece +class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = MarianTokenizer + test_rust_tokenizer = False + test_sentencepiece = True + + def setUp(self): + super().setUp() + vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + save_dir = Path(self.tmpdirname) + save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"]) + save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"]) + if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists(): + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"]) + copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"]) + + tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs) -> MarianTokenizer: + return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + return ( + "This is a test", + "This is a test", + ) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "</s>" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "</s>") + self.assertEqual(vocab_keys[1], "<unk>") + self.assertEqual(vocab_keys[-1], "<pad>") + self.assertEqual(len(vocab_keys), 9) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 9) + + def test_tokenizer_equivalence_en_de(self): + en_de_tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de") + batch = en_de_tokenizer(["I am a small frog"], return_tensors=None) +
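# With return_tensors=None, the input_ids stay plain Python lists inside the BatchEncoding, so they can be compared directly against the expected id list below. +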
self.assertIsInstance(batch, BatchEncoding) + expected = [38, 121, 14, 697, 38848, 0] + self.assertListEqual(expected, batch.input_ids[0]) + + save_dir = tempfile.mkdtemp() + en_de_tokenizer.save_pretrained(save_dir) + contents = [x.name for x in Path(save_dir).glob("*")] + self.assertIn("source.spm", contents) + MarianTokenizer.from_pretrained(save_dir) + + def test_outputs_not_longer_than_maxlen(self): + tok = self.get_tokenizer() + + batch = tok( + ["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK + ) + self.assertIsInstance(batch, BatchEncoding) + self.assertEqual(batch.input_ids.shape, (2, 512)) + + def test_outputs_can_be_shorter(self): + tok = self.get_tokenizer() + batch_smaller = tok(["I am a tiny frog", "I am a small frog"], padding=True, return_tensors=FRAMEWORK) + self.assertIsInstance(batch_smaller, BatchEncoding) + self.assertEqual(batch_smaller.input_ids.shape, (2, 10)) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[43495, 462, 20, 42164, 1369, 52, 464, 132, 1703, 492, 13, 7491, 38999, 6, 8, 464, 132, 1703, 492, 13, 4669, 37867, 13, 7525, 27, 1593, 988, 13, 33972, 7029, 6, 20, 8251, 383, 2, 270, 5866, 3788, 2, 2353, 8251, 12338, 2, 13958, 387, 2, 3629, 6953, 188, 2900, 2, 13958, 8011, 11501, 23, 8460, 4073, 34009, 20, 435, 11439, 27, 8, 8460, 4073, 6004, 20, 9988, 375, 27, 33, 266, 1945, 1076, 1350, 37867, 3288, 5, 577, 1076, 4374, 8, 5082, 5, 26453, 257, 556, 403, 2, 242, 132, 383, 316, 492, 8, 10767, 6, 316, 304, 4239, 3, 0], [148, 15722, 19, 1839, 12, 1350, 13, 22327, 5082, 5418, 47567, 35938, 59, 318, 19552, 108, 2183, 54, 14976, 4835, 32, 547, 1114, 8, 315, 2417, 5, 92, 19088, 3, 0, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100], [36, 6395, 12570, 39147, 11597, 6, 266, 4, 45405, 7296, 3, 0, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="Helsinki-NLP/opus-mt-en-de", + revision="1a8c2263da11e68e50938f97e10cd57820bd504c", + decode_kwargs={"use_source_tokenizer": True}, + ) diff --git a/test_tokenization_mbart.py b/test_tokenization_mbart.py new file mode 100644 index 0000000000000000000000000000000000000000..640aec60fd411e18b511eff0da79f27b086b24d5 --- /dev/null +++ b/test_tokenization_mbart.py @@ -0,0 +1,249 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +from transformers import SPIECE_UNDERLINE, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available +from transformers.testing_utils import nested_simplify, require_sentencepiece, require_tokenizers, require_torch + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +if is_torch_available(): + from transformers.models.mbart.modeling_mbart import shift_tokens_right + +EN_CODE = 250004 +RO_CODE = 250020 + + +@require_sentencepiece +@require_tokenizers +class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = MBartTokenizer + rust_tokenizer_class = MBartTokenizerFast + test_rust_tokenizer = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_full_tokenizer(self): + tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] + # ^ unk: 2 + 1 = 3 unk: 2 + 1 = 3 ^ + ], + ) + + back_tokens = 
tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "<unk>", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "<unk>", + ".", + ], + ) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartEnroIntegrationTest(unittest.TestCase): + checkpoint_name = "facebook/mbart-large-en-ro" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.', + ] + expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, EN_CODE] + + @classmethod + def setUpClass(cls): + cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained( + cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO" + ) + cls.pad_token_id = 1 + return cls + + def check_language_codes(self): + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_XX"], 250004) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020) + + def test_enro_tokenizer_batch_encode_plus(self): + ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] + self.assertListEqual(self.expected_src_tokens, ids) + + def test_enro_tokenizer_decode_ignores_language_codes(self): + self.assertIn(RO_CODE, self.tokenizer.all_special_ids) + generated_ids = [RO_CODE, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2] + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_romanian) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_enro_tokenizer_truncation(self): + src_text = ["this is gunna be a long sentence " * 20] + assert isinstance(src_text[0], str) + desired_max_length = 10 + ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] + self.assertEqual(ids[-2], 2) + self.assertEqual(ids[-1], EN_CODE) + self.assertEqual(len(ids), desired_max_length) + + def test_mask_token(self): + self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [250026, 250001]) + + def test_special_tokens_unaffected_by_save_load(self): + tmpdirname = tempfile.mkdtemp() + original_special_tokens = self.tokenizer.fairseq_tokens_to_ids + self.tokenizer.save_pretrained(tmpdirname) + new_tok = MBartTokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) + + @require_torch + def test_batch_fairseq_parity(self): + batch = self.tokenizer(self.src_text, padding=True) + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer(self.tgt_text,
padding=True, return_tensors="pt") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist() + + # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 + assert batch.input_ids[1][-2:] == [2, EN_CODE] + assert batch.decoder_input_ids[1][0] == RO_CODE + assert batch.decoder_input_ids[1][-1] == 2 + assert labels[1][-2:].tolist() == [2, RO_CODE] + + @require_torch + def test_enro_tokenizer_prepare_batch(self): + batch = self.tokenizer( + self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" + ) + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer( + self.tgt_text, + padding=True, + truncation=True, + max_length=len(self.expected_src_tokens), + return_tensors="pt", + ) + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 14), batch.input_ids.shape) + self.assertEqual((2, 14), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(self.expected_src_tokens, result) + self.assertEqual(2, batch.decoder_input_ids[0, -1]) # EOS + # Test that special tokens are reset + self.assertEqual(self.tokenizer.prefix_tokens, []) + self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id, EN_CODE]) + + def test_seq2seq_max_length(self): + batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.decoder_input_ids.shape[1], 10) + + @require_torch + def test_tokenizer_translation(self): + inputs = self.tokenizer._build_translation_inputs("A test", src_lang="en_XX", tgt_lang="ar_AR") + + self.assertEqual( + nested_simplify(inputs), + { + # A, test, EOS, en_XX + "input_ids": [[62, 3034, 2, 250004]], + "attention_mask": [[1, 1, 1, 1]], + # ar_AR + "forced_bos_token_id": 250001, + }, + ) diff --git a/test_tokenization_mbart50.py b/test_tokenization_mbart50.py new file mode 100644 index 0000000000000000000000000000000000000000..88a0c62da9dd59688e9be76727d17ffecda0ce6f --- /dev/null +++ b/test_tokenization_mbart50.py @@ -0,0 +1,243 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
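+# MBart-50 prefixes the language code ([lang_code, *tokens, eos]), unlike the original MBart, which suffixes it ([*tokens, eos, lang_code]); the expected_src_tokens fixtures in this module encode that layout.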
+ +import os +import tempfile +import unittest + +from transformers import SPIECE_UNDERLINE, BatchEncoding, MBart50Tokenizer, MBart50TokenizerFast, is_torch_available +from transformers.testing_utils import nested_simplify, require_sentencepiece, require_tokenizers, require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +if is_torch_available(): + from transformers.models.mbart.modeling_mbart import shift_tokens_right + +EN_CODE = 250004 +RO_CODE = 250020 + + +@require_sentencepiece +@require_tokenizers +class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = MBart50Tokenizer + rust_tokenizer_class = MBart50TokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<s>" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<s>") + self.assertEqual(vocab_keys[1], "<pad>") + self.assertEqual(vocab_keys[-1], "<mask>") + self.assertEqual(len(vocab_keys), 1_054) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_054) + + def test_full_tokenizer(self): + tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + # fmt: off + [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", "."], + # fmt: on + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] + ], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + # fmt: off + [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", "."], + # fmt: on + ) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[250004, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 5428, 4, 2777, 24480, 9874, 4,
43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 12399, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 2789, 1328, 4589, 42, 122009, 115774, 23, 805, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [250004, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [250004, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="facebook/mbart-large-50", + revision="d3913889c59cd5c9e456b269c376325eabad57e2", + ) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartOneToManyIntegrationTest(unittest.TestCase): + checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.', + ] + expected_src_tokens = [EN_CODE, 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2] + + @classmethod + def setUpClass(cls): + cls.tokenizer: MBart50Tokenizer = MBart50Tokenizer.from_pretrained( + cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO" + ) + cls.pad_token_id = 1 + return cls + + def 
check_language_codes(self): + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_XX"], 250004) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["mr_IN"], 250038) + + def test_tokenizer_batch_encode_plus(self): + ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] + self.assertListEqual(self.expected_src_tokens, ids) + + def test_tokenizer_decode_ignores_language_codes(self): + self.assertIn(RO_CODE, self.tokenizer.all_special_ids) + generated_ids = [RO_CODE, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2] + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_romanian) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_tokenizer_truncation(self): + src_text = ["this is gunna be a long sentence " * 20] + assert isinstance(src_text[0], str) + desired_max_length = 10 + ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] + self.assertEqual(ids[0], EN_CODE) + self.assertEqual(ids[-1], 2) + self.assertEqual(len(ids), desired_max_length) + + def test_mask_token(self): + self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [250053, 250001]) + + def test_special_tokens_unaffected_by_save_load(self): + tmpdirname = tempfile.mkdtemp() + original_special_tokens = self.tokenizer.fairseq_tokens_to_ids + self.tokenizer.save_pretrained(tmpdirname) + new_tok = MBart50Tokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) + + @require_torch + def test_batch_fairseq_parity(self): + batch = self.tokenizer(self.src_text, padding=True) + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist() + labels = labels.tolist() + + # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 + assert batch.input_ids[1][0] == EN_CODE + assert batch.input_ids[1][-1] == 2 + assert labels[1][0] == RO_CODE + assert labels[1][-1] == 2 + assert batch.decoder_input_ids[1][:2] == [2, RO_CODE] + + @require_torch + def test_tokenizer_prepare_batch(self): + batch = self.tokenizer( + self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt" + ) + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer( + self.tgt_text, + padding=True, + truncation=True, + max_length=len(self.expected_src_tokens), + return_tensors="pt", + ) + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 14), batch.input_ids.shape) + self.assertEqual((2, 14), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(self.expected_src_tokens, result) + self.assertEqual(2, batch.decoder_input_ids[0, 0]) # decoder_start_token_id + # Test that special tokens are reset + self.assertEqual(self.tokenizer.prefix_tokens, [EN_CODE]) + self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) + + def test_seq2seq_max_target_length(self): + batch =
self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt") + with self.tokenizer.as_target_tokenizer(): + targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.decoder_input_ids.shape[1], 10) + + @require_torch + def test_tokenizer_translation(self): + inputs = self.tokenizer._build_translation_inputs("A test", src_lang="en_XX", tgt_lang="ar_AR") + + self.assertEqual( + nested_simplify(inputs), + { + # en_XX, A, test, EOS + "input_ids": [[250004, 62, 3034, 2]], + "attention_mask": [[1, 1, 1, 1]], + # ar_AR + "forced_bos_token_id": 250001, + }, + ) diff --git a/test_tokenization_mpnet.py b/test_tokenization_mpnet.py new file mode 100644 index 0000000000000000000000000000000000000000..733b2891f8766782309fa654969fbcce848bdd39 --- /dev/null +++ b/test_tokenization_mpnet.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, Microsoft Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers import MPNetTokenizerFast +from transformers.models.mpnet.tokenization_mpnet import VOCAB_FILES_NAMES, MPNetTokenizer +from transformers.testing_utils import require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = MPNetTokenizer + rust_tokenizer_class = MPNetTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/mpnet-base") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = 
tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [0] + text + [2] + assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2] diff --git a/test_tokenization_openai.py b/test_tokenization_openai.py new file mode 100644 index 0000000000000000000000000000000000000000..1a7568aa5a37e2da15e875ef350cf7df8bc8b45e --- /dev/null +++ b/test_tokenization_openai.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast +from transformers.models.openai.tokenization_openai import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = OpenAIGPTTokenizer + rust_tokenizer_class = OpenAIGPTTokenizerFast + test_rust_tokenizer = True + test_seq2seq = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w</w>", + "r</w>", + "t</w>", + "lo", + "low", + "er</w>", + "low</w>", + "lowest</w>", + "newer</w>", + "wider</w>", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + + def get_input_output_texts(self, tokenizer): + return "lower newer", "lower newer" + + def test_full_tokenizer(self): + tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er</w>"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + ["<unk>"] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError,
tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + # tokenizer has no padding token + def test_padding_different_model_input_name(self): + pass diff --git a/test_tokenization_pegasus.py b/test_tokenization_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..583eace0f775cbe7e2cc2e20d8965c07e7047a2f --- /dev/null +++ b/test_tokenization_pegasus.py @@ -0,0 +1,208 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import PegasusTokenizer, PegasusTokenizerFast +from transformers.file_utils import cached_property +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") + + +@require_sentencepiece +@require_tokenizers +class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = PegasusTokenizer + rust_tokenizer_class = PegasusTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = PegasusTokenizer(SAMPLE_VOCAB) + tokenizer.save_pretrained(self.tmpdirname) + + @cached_property + def _large_tokenizer(self): + return PegasusTokenizer.from_pretrained("google/pegasus-large") + + def get_tokenizer(self, **kwargs) -> PegasusTokenizer: + return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + return ("This is a test", "This is a test") + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "</s>" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<pad>") + self.assertEqual(vocab_keys[1], "</s>") + self.assertEqual(vocab_keys[-1], "v") + self.assertEqual(len(vocab_keys), 1_103) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_103) + + def test_mask_tokens_rust_pegasus(self): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) + py_tokenizer =
self.tokenizer_class.from_pretrained(self.tmpdirname) + raw_input_str = "Let's see which <unk> is the better <unk_2> one <mask_1> It seems like this <mask_2> was important </s> <pad>" + rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] + py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] + self.assertListEqual(py_ids, rust_ids) + + def test_large_mask_tokens(self): + tokenizer = self._large_tokenizer + # <mask_1> masks whole sentence while <mask_2> masks single word + raw_input_str = "<mask_1> To ensure a <mask_2> flow of bank resolutions." + desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1] + ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0] + self.assertListEqual(desired_result, ids) + + def test_large_tokenizer_settings(self): + tokenizer = self._large_tokenizer + # The tracebacks for the following asserts are **better** without messages or self.assertEqual + assert tokenizer.vocab_size == 96103 + assert tokenizer.pad_token_id == 0 + assert tokenizer.eos_token_id == 1 + assert tokenizer.offset == 103 + assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105 + assert tokenizer.unk_token == "<unk>" + assert tokenizer.model_max_length == 1024 + raw_input_str = "To ensure a smooth flow of bank resolutions." + desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1] + ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0] + self.assertListEqual(desired_result, ids) + assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"] + + @require_torch + def test_large_seq2seq_truncation(self): + src_texts = ["This is going to be way too long." * 150, "short example"] + tgt_texts = ["not super long but more than 5 tokens", "tiny"] + batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") + with self._large_tokenizer.as_target_tokenizer(): + targets = self._large_tokenizer( + tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" + ) + + assert batch.input_ids.shape == (2, 1024) + assert batch.attention_mask.shape == (2, 1024) + assert targets["input_ids"].shape == (2, 5) + assert len(batch) == 2 # input_ids, attention_mask.
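+ + # Ids in the fixtures above are offset SentencePiece ids: Pegasus reserves the low ids for special tokens (<pad>=0, </s>=1, <mask_1>=2, <mask_2>=3, ...), so ordinary pieces are shifted by tokenizer.offset (103) and unk_token_id == tokenizer.offset + 2 == 105, as asserted in test_large_tokenizer_settings.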
+ + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[38979, 143, 18485, 606, 130, 26669, 87686, 121, 54189, 1129, 111, 26669, 87686, 121, 9114, 14787, 121, 13249, 158, 592, 956, 121, 14621, 31576, 143, 62613, 108, 9688, 930, 43430, 11562, 62613, 304, 108, 11443, 897, 108, 9314, 17415, 63399, 108, 11443, 7614, 18316, 118, 4284, 7148, 12430, 143, 1400, 25703, 158, 111, 4284, 7148, 11772, 143, 21297, 1064, 158, 122, 204, 3506, 1754, 1133, 14787, 1581, 115, 33224, 4482, 111, 1355, 110, 29173, 317, 50833, 108, 20147, 94665, 111, 77198, 107, 1], [110, 62613, 117, 638, 112, 1133, 121, 20098, 1355, 79050, 13872, 135, 1596, 53541, 1352, 141, 13039, 5542, 124, 302, 518, 111, 268, 2956, 115, 149, 4427, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [139, 1235, 2799, 18289, 17780, 204, 109, 9474, 1296, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="google/bigbird-pegasus-large-arxiv", + revision="ba85d0851d708441f91440d509690f1ab6353415", + ) + + +@require_sentencepiece +@require_tokenizers +class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = PegasusTokenizer + rust_tokenizer_class = PegasusTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]") + tokenizer.save_pretrained(self.tmpdirname) + + @cached_property + def _large_tokenizer(self): + return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") + + def get_tokenizer(self, **kwargs) -> PegasusTokenizer: + return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + return ("This is a test", "This is a test") + + def test_mask_tokens_rust_pegasus(self): + rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) + py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname) + raw_input_str = "Let's see which <unk> is the better <unk_2> one [MASK] It seems like this [MASK] was important </s> <pad>" + rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] + py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0] +
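# The fixture saved in setUp uses offset=0 and a plain [MASK] mask token, so the slow and fast tokenizers should agree on unshifted ids here. +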
self.assertListEqual(py_ids, rust_ids) + + @require_torch + def test_large_seq2seq_truncation(self): + src_texts = ["This is going to be way too long." * 1000, "short example"] + tgt_texts = ["not super long but more than 5 tokens", "tiny"] + batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt") + with self._large_tokenizer.as_target_tokenizer(): + targets = self._large_tokenizer( + tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt" + ) + + assert batch.input_ids.shape == (2, 4096) + assert batch.attention_mask.shape == (2, 4096) + assert targets["input_ids"].shape == (2, 5) + assert len(batch) == 2 # input_ids, attention_mask. + + def test_equivalence_to_orig_tokenizer(self): + """ + To run with original TF tokenizer: + + !wget https://github.com/google-research/bigbird/raw/master/bigbird/vocab/pegasus.model + !pip install tensorflow-text + + import tensorflow.compat.v2 as tf + import tensorflow_text as tft + + VOCAB_FILE = "./pegasus.model" + + tf.enable_v2_behavior() + + test_str = "This is an example string that is used to test the original TF implementation against the HF implementation" + tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(VOCAB_FILE, "rb").read()) + + tokenizer.tokenize(test_str) + """ + + test_str = "This is an example string that is used to test the original TF implementation against the HF implementation" + + token_ids = self._large_tokenizer(test_str).input_ids + + self.assertListEqual( + token_ids, + [182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1], + ) diff --git a/test_tokenization_phobert.py b/test_tokenization_phobert.py new file mode 100644 index 0000000000000000000000000000000000000000..b5d42c8a2457086de64e4e3b0774643ff7b33852 --- /dev/null +++ b/test_tokenization_phobert.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = PhobertTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["T@@", "i", "I", "R@@", "r", "e@@"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l à</w>"] + self.special_tokens_map = {"unk_token": "<unk>"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + + with open(self.vocab_file, "w", encoding="utf-8") as fp: + for token in vocab_tokens: + fp.write(f"{token} {vocab_tokens[token]}\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "Tôi là VinAI Research" + output_text = "T<unk> i I Re<unk> e<unk>" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = PhobertTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "Tôi là VinAI Research" + bpe_tokens = "T@@ ô@@ i l@@ à V@@ i@@ n@@ A@@ I R@@ e@@ s@@ e@@ a@@ r@@ c@@ h".split() + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [4, 3, 5, 3, 3, 3, 3, 3, 3, 6, 7, 9, 3, 9, 3, 3, 3, 3, 3] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/test_tokenization_prophetnet.py b/test_tokenization_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c073304aa90223628f9faeac409d4142e21660fc --- /dev/null +++ b/test_tokenization_prophetnet.py @@ -0,0 +1,206 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
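+ +# ProphetNet's slow tokenizer is WordPiece-based: it reuses BERT's BasicTokenizer and WordpieceTokenizer, which is why those classes and the _is_control/_is_punctuation/_is_whitespace helpers are imported from the BERT tokenization module and exercised directly in the tests below.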
+ + +import os +import unittest + +from transformers import BatchEncoding +from transformers.models.bert.tokenization_bert import ( + BasicTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.models.prophetnet.tokenization_prophetnet import VOCAB_FILES_NAMES, ProphetNetTokenizer +from transformers.testing_utils import require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = ProphetNetTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + @require_torch + def test_prepare_batch(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/prophetnet-large-uncased") + + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + expected_src_tokens = [1037, 2146, 20423, 2005, 7680, 7849, 3989, 1012, 102] + batch = tokenizer(src_text, padding=True, return_tensors="pt") + self.assertIsInstance(batch, BatchEncoding) + result = list(batch.input_ids.numpy()[0]) + self.assertListEqual(expected_src_tokens, result) + + self.assertEqual((2, 9), batch.input_ids.shape) + self.assertEqual((2, 9), batch.attention_mask.shape) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/prophetnet-large-uncased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == text + [102] + assert encoded_pair == text + [102] + text_2 + [102] diff --git a/test_tokenization_rag.py b/test_tokenization_rag.py new file mode 100644 index 0000000000000000000000000000000000000000..eefe119e689166cd695d322855e5a6165e8ae71b --- /dev/null +++ b/test_tokenization_rag.py @@ -0,0 +1,170 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import shutil
+import tempfile
+from unittest import TestCase
+
+from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
+from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available
+from transformers.models.bart.configuration_bart import BartConfig
+from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
+from transformers.models.dpr.configuration_dpr import DPRConfig
+from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
+from transformers.testing_utils import require_datasets, require_faiss, require_tokenizers, require_torch, slow
+
+
+if is_torch_available() and is_datasets_available() and is_faiss_available():
+    from transformers.models.rag.configuration_rag import RagConfig
+    from transformers.models.rag.tokenization_rag import RagTokenizer
+
+
+@require_faiss
+@require_datasets
+@require_torch
+class RagTokenizerTest(TestCase):
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+        self.retrieval_vector_size = 8
+
+        # DPR tok
+        vocab_tokens = [
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[PAD]",
+            "[MASK]",
+            "want",
+            "##want",
+            "##ed",
+            "wa",
+            "un",
+            "runn",
+            "##ing",
+            ",",
+            "low",
+            "lowest",
+        ]
+        dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
+        os.makedirs(dpr_tokenizer_path, exist_ok=True)
+        self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+        # BART tok
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "\u0120",
+            "\u0120l",
+            "\u0120n",
+            "\u0120lo",
+            "\u0120low",
+            "er",
+            "\u0120lowest",
+            "\u0120newer",
+            "\u0120wider",
+            "<unk>",
+        ]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
+
+        bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
+        os.makedirs(bart_tokenizer_path, exist_ok=True)
+        self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
+            fp.write("\n".join(merges))
+
+    def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
+        return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer"))
+
+    def get_bart_tokenizer(self) -> BartTokenizer:
+        return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer"))
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    @require_tokenizers
+    def test_save_load_pretrained_with_saved_config(self):
+
+        save_dir = os.path.join(self.tmpdirname, "rag_tokenizer")
+        rag_config = 
RagConfig(question_encoder=DPRConfig().to_dict(), generator=BartConfig().to_dict()) + rag_tokenizer = RagTokenizer(question_encoder=self.get_dpr_tokenizer(), generator=self.get_bart_tokenizer()) + rag_config.save_pretrained(save_dir) + rag_tokenizer.save_pretrained(save_dir) + new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) + self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) + self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab()) + self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast) + self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab()) + + @slow + def test_pretrained_token_nq_tokenizer(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") + input_strings = [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + "what is the first step in the evolution of the eye", + "where is gall bladder situated in human body", + "what is the main mineral in lithium batteries", + "who is the president of usa right now", + "where do the greasers live in the outsiders", + "panda is a national animal of which country", + "what is the name of manchester united stadium", + ] + input_dict = tokenizer(input_strings) + self.assertIsNotNone(input_dict) + + @slow + def test_pretrained_sequence_nq_tokenizer(self): + tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") + input_strings = [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + "what is the first step in the evolution of the eye", + "where is gall bladder situated in human body", + "what is the main mineral in lithium batteries", + "who is the president of usa right now", + "where do the greasers live in the outsiders", + "panda is a national animal of which country", + "what is the name of manchester united stadium", + ] + input_dict = tokenizer(input_strings) + self.assertIsNotNone(input_dict) diff --git a/test_tokenization_reformer.py b/test_tokenization_reformer.py new file mode 100644 index 0000000000000000000000000000000000000000..398f3fd8e6edb584847129ddead94d5342ca4be5 --- /dev/null +++ b/test_tokenization_reformer.py @@ -0,0 +1,371 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+from transformers import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
+from transformers.file_utils import cached_property
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+
+
+@require_sentencepiece
+@require_tokenizers
+class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = ReformerTokenizer
+    rust_tokenizer_class = ReformerTokenizerFast
+    test_rust_tokenizer = True
+    test_seq2seq = False
+    test_sentencepiece = True
+
+    def setUp(self):
+        super().setUp()
+
+        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def test_convert_token_and_id(self):
+        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        token = "<s>"
+        token_id = 1
+
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+
+        self.assertEqual(vocab_keys[0], "<unk>")
+        self.assertEqual(vocab_keys[1], "<s>")
+        self.assertEqual(vocab_keys[-1], "j")
+        self.assertEqual(len(vocab_keys), 1_000)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
+
+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            return
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence = "I was born in 92000, and this is falsé."
+ + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + # tokenizer has no padding token + def test_padding_different_model_input_name(self): + pass + + def test_full_tokenizer(self): + tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + @cached_property + def big_tokenizer(self): + return ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment") + + @slow + def test_tokenization_base_easy_symbols(self): + symbols = "Hello World!" 
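+        # (Editor's note: the hard-coded ids below are pinned regression values produced by the
+        # pretrained "google/reformer-crime-and-punishment" SentencePiece model loaded as
+        # big_tokenizer above; they guard against silent changes to the released vocab.)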
+ original_tokenizer_encodings = [126, 32, 262, 152, 38, 72, 287] + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenization_base_hard_symbols(self): + symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to , such as saoneuhaoesuth' + original_tokenizer_encodings = [ + 108, + 265, + 24, + 111, + 4, + 258, + 156, + 35, + 28, + 275, + 3, + 259, + 297, + 260, + 84, + 4, + 35, + 110, + 44, + 8, + 259, + 91, + 268, + 21, + 11, + 209, + 274, + 109, + 266, + 277, + 117, + 86, + 93, + 315, + 258, + 278, + 258, + 277, + 258, + 0, + 258, + 288, + 258, + 319, + 258, + 0, + 258, + 0, + 258, + 0, + 258, + 0, + 258, + 287, + 258, + 315, + 258, + 289, + 258, + 278, + 99, + 269, + 266, + 262, + 8, + 259, + 241, + 4, + 217, + 230, + 268, + 266, + 55, + 168, + 106, + 75, + 193, + 266, + 223, + 27, + 49, + 26, + 282, + 25, + 264, + 299, + 19, + 26, + 0, + 258, + 277, + 117, + 86, + 93, + 176, + 183, + 270, + 11, + 262, + 42, + 61, + 265, + ] + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import ReformerConfig, ReformerModel + + # Build sequence + first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt") + batch_encoded_sequence = self.big_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="pt") + + config = ReformerConfig() + # The input gets padded during training so adjust the axial position encodings from the pretrained model value of (512, 1024) + config.axial_pos_shape = encoded_sequence["input_ids"].shape + model = ReformerModel(config) + + # Reformer has config.vocab_size == tokenizer.vocab_size == len(tokenizer) - 1 = 320; len(tokenizer) is 321 (including a pad token with id 320) + assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size + + with torch.no_grad(): + model(**encoded_sequence) + model(**batch_encoded_sequence) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[108, 265, 24, 111, 4, 258, 156, 7, 51, 279, 58, 7, 76, 25, 69, 278], [140, 243, 264, 134, 17, 267, 77, 263, 22, 262, 297, 258, 304, 177, 279, 266, 14, 89, 13, 35, 261, 299, 272, 137, 275, 278]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + # This tokenizer does not know some characters like ")". + # That is the reason why we use very simple texts here. 
+        # Also see https://github.com/huggingface/transformers/pull/11737#issuecomment-850769064
+        sequences = [
+            "This is a very simple sentence.",
+            "The quick brown fox jumps over the lazy dog.",
+        ]
+
+        self.tokenizer_integration_test_util(
+            expected_encoding=expected_encoding,
+            model_name="google/reformer-crime-and-punishment",
+            revision="0e6c3decb8211d49bf881013425dc8b0448b3f5a",
+            padding=False,
+            sequences=sequences,
+        )
diff --git a/test_tokenization_roberta.py b/test_tokenization_roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..746c88d0f178cadba7fc5fc7423dd20a56a69b5d
--- /dev/null
+++ b/test_tokenization_roberta.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import os
+import unittest
+
+from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
+from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
+from transformers.testing_utils import require_tokenizers, slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+@require_tokenizers
+class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    tokenizer_class = RobertaTokenizer
+    rust_tokenizer_class = RobertaTokenizerFast
+    test_rust_tokenizer = True
+    from_pretrained_kwargs = {"cls_token": "<s>"}
+
+    def setUp(self):
+        super().setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "\u0120",
+            "\u0120l",
+            "\u0120n",
+            "\u0120lo",
+            "\u0120low",
+            "er",
+            "\u0120lowest",
+            "\u0120newer",
+            "\u0120wider",
+            "<unk>",
+        ]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
+            fp.write("\n".join(merges))
+
+    def get_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_rust_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "lower newer"
+        output_text = "lower newer"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        text = "lower newer"
+        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
+        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + [tokenizer.unk_token]
+        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    def roberta_dict_integration_testing(self):
+        tokenizer = self.get_tokenizer()
+
+        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
+        self.assertListEqual(
+            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
+            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
+        )
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("roberta-base")
+
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
+
+        encoded_text_from_decode = tokenizer.encode(
+            "sequence builders", add_special_tokens=True, add_prefix_space=False
+        )
+        encoded_pair_from_decode = tokenizer.encode(
+            "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
+        )
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        assert encoded_sentence == encoded_text_from_decode
+        assert encoded_pair == encoded_pair_from_decode
+
+    def test_space_encoding(self):
+        tokenizer = self.get_tokenizer()
+
+        sequence = "Encode this sequence."
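+        # (Editor's note: GPT-2-style byte-level BPE maps every input byte to a printable unicode
+        # character; the space byte 0x20 maps to "\u0120" ("Ġ"), so a token whose first character
+        # is Ġ records that the word was preceded by a space. The assertions below look that
+        # character up via tokenizer.byte_encoder instead of hard-coding it.)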
+        space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]
+
+        # Testing encoder arguments
+        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False)
+        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
+        self.assertNotEqual(first_char, space_encoding)
+
+        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
+        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
+        self.assertEqual(first_char, space_encoding)
+
+        tokenizer.add_special_tokens({"bos_token": "<s>"})
+        encoded = tokenizer.encode(sequence, add_special_tokens=True)
+        first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
+        self.assertNotEqual(first_char, space_encoding)
+
+        # Testing spaces after special tokens
+        mask = "<mask>"
+        tokenizer.add_special_tokens(
+            {"mask_token": AddedToken(mask, lstrip=True, rstrip=False)}
+        )  # mask token has a left space
+        mask_ind = tokenizer.convert_tokens_to_ids(mask)
+
+        sequence = "Encode <mask> sequence"
+        sequence_nospace = "Encode <mask>sequence"
+
+        encoded = tokenizer.encode(sequence)
+        mask_loc = encoded.index(mask_ind)
+        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
+        self.assertEqual(first_char, space_encoding)
+
+        encoded = tokenizer.encode(sequence_nospace)
+        mask_loc = encoded.index(mask_ind)
+        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
+        self.assertNotEqual(first_char, space_encoding)
+
+    def test_pretokenized_inputs(self):
+        pass
+
+    def test_embeded_special_tokens(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                sentence = "A, <mask> AllenNLP sentence."
+                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
+                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
+
+                # token_type_ids should put 0 everywhere
+                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
+
+                # attention_mask should put 1 everywhere, so sum over length should be 1
+                self.assertEqual(
+                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
+                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
+                )
+
+                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
+                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
+
+                # Rust correctly handles the space before the mask while Python doesn't
+                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
+                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
+
+                self.assertSequenceEqual(
+                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
+                )
+                self.assertSequenceEqual(
+                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
+                )
diff --git a/test_tokenization_roformer.py b/test_tokenization_roformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c7fb65431e1c21cb0002d6865b325a8e626741
--- /dev/null
+++ b/test_tokenization_roformer.py
@@ -0,0 +1,84 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import unittest + +from transformers import RoFormerTokenizer, RoFormerTokenizerFast +from transformers.testing_utils import require_tokenizers + +from .test_tokenization_common import TokenizerTesterMixin + + +def is_rjieba_available(): + return importlib.util.find_spec("rjieba") is not None + + +def require_rjieba(test_case): + """ + Decorator marking a test that requires Jieba. These tests are skipped when Jieba isn't installed. + """ + if not is_rjieba_available(): + return unittest.skip("test requires rjieba")(test_case) + else: + return test_case + + +@require_rjieba +@require_tokenizers +class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = RoFormerTokenizer + rust_tokenizer_class = RoFormerTokenizerFast + space_between_special_tokens = True + test_rust_tokenizer = True + + def setUp(self): + super().setUp() + + def get_tokenizer(self, **kwargs): + return self.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return self.rust_tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) + + def get_chinese_input_output_texts(self): + input_text = "永和服装饰品有限公司,今天天气非常好" + output_text = "永和 服装 饰品 有限公司 , 今 天 天 气 非常 好" + return input_text, output_text + + def test_tokenizer(self): + tokenizer = self.get_tokenizer() + input_text, output_text = self.get_chinese_input_output_texts() + tokens = tokenizer.tokenize(input_text) + + self.assertListEqual(tokens, output_text.split()) + + input_tokens = tokens + [tokenizer.unk_token] + exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) + + def test_rust_tokenizer(self): + tokenizer = self.get_rust_tokenizer() + input_text, output_text = self.get_chinese_input_output_texts() + tokens = tokenizer.tokenize(input_text) + self.assertListEqual(tokens, output_text.split()) + input_tokens = tokens + [tokenizer.unk_token] + exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) + + # due to custom pre_tokenize , char_to_token may be error + def test_alignement_methods(self): + pass diff --git a/test_tokenization_small_blenderbot.py b/test_tokenization_small_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..9169d21b431e55dc3538a5137faaf8be9ecd7d79 --- /dev/null +++ b/test_tokenization_small_blenderbot.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the Blenderbot small tokenizer.""" +import json +import os +import unittest + +from transformers.models.blenderbot_small.tokenization_blenderbot_small import ( + VOCAB_FILES_NAMES, + BlenderbotSmallTokenizer, +) + +from .test_tokenization_common import TokenizerTesterMixin + + +class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BlenderbotSmallTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""] + self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "adapt act apte" + output_text = "adapt act apte" + return input_text, output_text + + def test_full_blenderbot_small_tokenizer(self): + tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "adapt act apte" + bpe_tokens = ["adapt", "act", "ap@@", "te"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token] + + input_bpe_tokens = [0, 1, 2, 3, 4, 5] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_special_tokens_small_tok(self): + tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + assert tok("sam").input_ids == [1384] + src_text = "I am a small frog." + encoded = tok([src_text], padding=False, truncation=False)["input_ids"] + decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + assert src_text != decoded # I wish it did! + assert decoded == "i am a small frog ." + + def test_empty_word_small_tok(self): + tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + src_text = "I am a small frog ." + src_text_dot = "." + encoded = tok(src_text)["input_ids"] + encoded_dot = tok(src_text_dot)["input_ids"] + + assert encoded[-1] == encoded_dot[0] diff --git a/test_tokenization_speech_to_text.py b/test_tokenization_speech_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..649e638791248b3a102ed6cf55aba221b4192639 --- /dev/null +++ b/test_tokenization_speech_to_text.py @@ -0,0 +1,164 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+from pathlib import Path
+from shutil import copyfile
+
+from transformers import SPIECE_UNDERLINE, is_sentencepiece_available
+from transformers.models.speech_to_text import Speech2TextTokenizer
+from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_SP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+
+if is_sentencepiece_available():
+    import sentencepiece as sp
+
+
+FR_CODE = 5
+ES_CODE = 10
+
+
+@require_sentencepiece
+@require_tokenizers
+class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    tokenizer_class = Speech2TextTokenizer
+    test_rust_tokenizer = False
+    test_sentencepiece = True
+
+    def setUp(self):
+        super().setUp()
+
+        spm_model = sp.SentencePieceProcessor()
+        spm_model.Load(SAMPLE_SP)
+        vocab = ["<s>", "<pad>", "</s>", "<unk>"]
+
+        vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+
+        save_dir = Path(self.tmpdirname)
+        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
+        if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
+            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
+
+        tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def test_convert_token_and_id(self):
+        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        token = "<pad>"
+        token_id = 1
+
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+
+        self.assertEqual(vocab_keys[0], "<s>")
+        self.assertEqual(vocab_keys[1], "<pad>")
+        self.assertEqual(vocab_keys[-1], "j")
+        self.assertEqual(len(vocab_keys), 1_001)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_001)
+
+    def test_full_tokenizer(self):
+        tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens),
+            [289, 50, 14, 174, 386],
+        )
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            # fmt: off
+            [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", "."],
+            # fmt: on
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [12, 25, 88,
59, 28, 23, 11, 4, 606, 351, 351, 351, 7, 16, 70, 50, 76, 84, 10, 4, 8]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + # fmt: off + [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "", "."], + # fmt: on + ) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[3791, 797, 31, 11, 64, 797, 31, 2429, 433, 12, 1176, 12, 20, 786, 915, 142, 2413, 240, 37, 3238, 797, 31, 11, 35, 93, 915, 142, 2413, 240, 37, 5540, 567, 1276, 93, 37, 610, 40, 62, 455, 657, 1042, 123, 780, 177, 37, 309, 241, 1298, 514, 20, 292, 2737, 114, 2469, 241, 85, 64, 302, 548, 528, 423, 4, 509, 406, 423, 37, 601, 4, 777, 302, 548, 528, 423, 284, 4, 3388, 511, 459, 4, 3555, 40, 321, 302, 705, 4, 3388, 511, 583, 326, 5, 5, 5, 62, 3310, 560, 177, 2680, 217, 1508, 32, 31, 853, 418, 64, 583, 511, 1605, 62, 35, 93, 560, 177, 2680, 217, 1508, 1521, 64, 583, 511, 519, 62, 20, 1515, 764, 20, 149, 261, 5625, 7972, 20, 5540, 567, 1276, 93, 3925, 1675, 11, 15, 802, 7972, 576, 217, 1508, 11, 35, 93, 1253, 2441, 15, 289, 652, 31, 416, 321, 3842, 115, 40, 911, 8, 476, 619, 4, 380, 142, 423, 335, 240, 35, 93, 264, 8, 11, 335, 569, 420, 163, 5, 2], [260, 548, 528, 423, 20, 451, 20, 2681, 1153, 3434, 20, 5540, 37, 567, 126, 1253, 2441, 3376, 449, 210, 431, 1563, 177, 767, 5540, 11, 1203, 472, 11, 2953, 685, 285, 364, 706, 1153, 20, 6799, 20, 2869, 20, 4464, 126, 40, 2429, 20, 1040, 866, 2664, 418, 20, 318, 20, 1726, 186, 20, 265, 522, 35, 93, 2191, 4634, 20, 1040, 12, 6799, 15, 228, 2356, 142, 31, 11, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2575, 2666, 684, 1582, 1176, 12, 627, 149, 619, 20, 4902, 563, 11, 20, 149, 261, 3420, 2356, 174, 142, 4714, 131, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="facebook/s2t-small-mustc-en-de-st", + revision="a14f04cf0776c02f62a8cb800cf7909e15ea23ad", + ) + + +@require_sentencepiece +class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase): + checkpoint_name = "valhalla/s2t_mustc_multilinguial_medium" + + french_text = "C'est trop cool" + spanish_text = "Esto es genial" + + @classmethod + def setUpClass(cls): + cls.tokenizer: Speech2TextTokenizer = Speech2TextTokenizer.from_pretrained(cls.checkpoint_name) + return cls + + def check_language_codes(self): + self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4) + self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6) + self.assertEqual(self.tokenizer.lang_code_to_id["it"], 9) + self.assertEqual(self.tokenizer.lang_code_to_id["de"], 11) + + def test_vocab_size(self): + self.assertEqual(self.tokenizer.vocab_size, 10_000) + + def test_tokenizer_decode_ignores_language_codes(self): + self.assertIn(ES_CODE, self.tokenizer.all_special_ids) + generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2] + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_spanish) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_tokenizer_adds_special_tokens(self): + self.tokenizer.tgt_lang = "fr" + encoded = self.tokenizer(self.french_text).input_ids + self.assertEqual(encoded[0], FR_CODE) + self.assertEqual(encoded[-1], self.tokenizer.eos_token_id) + + def test_tgt_lang_setter(self): + self.tokenizer.tgt_lang = "fr" + self.assertListEqual(self.tokenizer.prefix_tokens, [FR_CODE]) + + self.tokenizer.tgt_lang = "es" + self.assertListEqual(self.tokenizer.prefix_tokens, [ES_CODE]) diff --git a/test_tokenization_squeezebert.py b/test_tokenization_squeezebert.py new file mode 100644 index 0000000000000000000000000000000000000000..3637717a0c76cefea31c4c9021abec8be823035a --- /dev/null +++ b/test_tokenization_squeezebert.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
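The multilingual Speech2Text assertions just above all exercise one mechanism: assigning `tgt_lang` rewrites `prefix_tokens`, which the tokenizer prepends to every encoded sequence, while an eos id is appended at the end. A minimal usage sketch of that behaviour, assuming the `valhalla/s2t_mustc_multilinguial_medium` checkpoint used in the test is reachable:

```python
from transformers import Speech2TextTokenizer

tok = Speech2TextTokenizer.from_pretrained("valhalla/s2t_mustc_multilinguial_medium")

# Selecting a target language swaps the language-code prefix token ...
tok.tgt_lang = "fr"
ids = tok("C'est trop cool").input_ids

# ... so an encoded sequence has the shape [lang_code] + tokens + [eos].
assert ids[0] == tok.lang_code_to_id["fr"]
assert ids[-1] == tok.eos_token_id
```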
+ + +from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast +from transformers.testing_utils import require_tokenizers, slow + +from .test_tokenization_bert import BertTokenizationTest + + +@require_tokenizers +class SqueezeBertTokenizationTest(BertTokenizationTest): + + tokenizer_class = SqueezeBertTokenizer + rust_tokenizer_class = SqueezeBertTokenizerFast + test_rust_tokenizer = True + + def get_rust_tokenizer(self, **kwargs): + return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + @slow + def test_sequence_builders(self): + tokenizer = SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-mnli-headless") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] diff --git a/test_tokenization_t5.py b/test_tokenization_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..89557387b6682b0a1b6aadb2a4b39a605251cc81 --- /dev/null +++ b/test_tokenization_t5.py @@ -0,0 +1,307 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
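The T5 suite that follows leans on two behaviours worth calling out: the tokenizer appends the `</s>` eos token to every input (which is why `test_eos_treatment` below expects `"hi</s>"` and `"hi"` to encode identically), and target text is tokenized inside the `as_target_tokenizer()` context manager. A hedged sketch of that pattern, assuming the `t5-base` checkpoint the tests load and a working PyTorch install:

```python
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")

# The eos token is appended automatically to encoder inputs.
batch = tokenizer(["A long paragraph for summarization."], return_tensors="pt")
assert batch.input_ids[0, -1].item() == tokenizer.eos_token_id

# Labels/targets are produced under the target-tokenizer context.
with tokenizer.as_target_tokenizer():
    labels = tokenizer(["Summary of the text."], return_tensors="pt")
```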
+
+import unittest
+
+from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
+from transformers.file_utils import cached_property, is_tf_available, is_torch_available
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+
+from .test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+if is_torch_available():
+    FRAMEWORK = "pt"
+elif is_tf_available():
+    FRAMEWORK = "tf"
+else:
+    FRAMEWORK = "jax"
+
+
+@require_sentencepiece
+@require_tokenizers
+class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = T5Tokenizer
+    rust_tokenizer_class = T5TokenizerFast
+    test_rust_tokenizer = True
+    test_sentencepiece = True
+
+    def setUp(self):
+        super().setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def test_convert_token_and_id(self):
+        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        token = "<s>"
+        token_id = 1
+
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+
+        self.assertEqual(vocab_keys[0], "<unk>")
+        self.assertEqual(vocab_keys[1], "<s>")
+        self.assertEqual(vocab_keys[-1], "<extra_id_99>")
+        self.assertEqual(len(vocab_keys), 1_101)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_100)
+
+    def test_full_tokenizer(self):
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "9",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "é",
+                ".",
+            ],
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(
+            back_tokens,
+            [
+                SPIECE_UNDERLINE + "I",
+                SPIECE_UNDERLINE + "was",
+                SPIECE_UNDERLINE + "b",
+                "or",
+                "n",
+                SPIECE_UNDERLINE + "in",
+                SPIECE_UNDERLINE + "",
+                "<unk>",
+                "2",
+                "0",
+                "0",
+                "0",
+                ",",
+                SPIECE_UNDERLINE + "and",
+                SPIECE_UNDERLINE + "this",
+                SPIECE_UNDERLINE + "is",
+                SPIECE_UNDERLINE + "f",
+                "al",
+                "s",
+                "<unk>",
+                ".",
+            ],
+        )
+
+    @cached_property
+    def t5_base_tokenizer(self):
+        return T5Tokenizer.from_pretrained("t5-base")
+
+    @cached_property
+    def t5_base_tokenizer_fast(self):
+        return T5TokenizerFast.from_pretrained("t5-base")
+
+    def get_tokenizer(self, **kwargs) -> T5Tokenizer:
+        return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
+
+    def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
+        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
+
+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            return
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence = "I was born in 92000, and this is falsé."
+
+        tokens = tokenizer.tokenize(sequence)
+        rust_tokens = rust_tokenizer.tokenize(sequence)
+        self.assertListEqual(tokens, rust_tokens)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        rust_tokenizer = self.get_rust_tokenizer()
+        ids = tokenizer.encode(sequence)
+        rust_ids = rust_tokenizer.encode(sequence)
+        self.assertListEqual(ids, rust_ids)
+
+    def test_eos_treatment(self):
+        tokenizer = self.t5_base_tokenizer
+        batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])
+        batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""])
+        self.assertListEqual(batch_with_eos_added["input_ids"], batch_without_eos_added["input_ids"])
+
+    def test_prepare_batch(self):
+        tokenizer = self.t5_base_tokenizer
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
+        expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, tokenizer.eos_token_id]
+        batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
+        self.assertIsInstance(batch, BatchEncoding)
+
+        if FRAMEWORK != "jax":
+            result = list(batch.input_ids.numpy()[0])
+        else:
+            result = list(batch.input_ids.tolist()[0])
+
+        self.assertListEqual(expected_src_tokens, result)
+
+        self.assertEqual((2, 9), batch.input_ids.shape)
+        self.assertEqual((2, 9), batch.attention_mask.shape)
+
+    def test_empty_target_text(self):
+        tokenizer = self.t5_base_tokenizer
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
+        batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
+        # check if input_ids are returned and no decoder_input_ids
+        self.assertIn("input_ids", batch)
+        self.assertIn("attention_mask", batch)
+        self.assertNotIn("decoder_input_ids", batch)
+        self.assertNotIn("decoder_attention_mask", batch)
+
+    def test_max_length(self):
+        tokenizer = self.t5_base_tokenizer
+        tgt_text = [
+            "Summary of the text.",
+            "Another summary.",
+        ]
+        with tokenizer.as_target_tokenizer():
+            targets = tokenizer(
+                tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
+            )
+        self.assertEqual(32, targets["input_ids"].shape[1])
+
+    def test_outputs_not_longer_than_maxlen(self):
+        tokenizer = self.t5_base_tokenizer
+
+        batch = tokenizer(
+            ["I am a small frog" * 1000, "I am a small frog"], padding=True, truncation=True, return_tensors=FRAMEWORK
+        )
+        self.assertIsInstance(batch, BatchEncoding)
+        self.assertEqual(batch.input_ids.shape, (2, 512))
+
+    def test_eos_in_input(self):
+        tokenizer = self.t5_base_tokenizer
+        src_text = ["A long paragraph for summarization. </s>"]
+        tgt_text = ["Summary of the text. </s>
"] + expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1] + expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1] + + batch = tokenizer(src_text) + with tokenizer.as_target_tokenizer(): + targets = tokenizer(tgt_text) + + self.assertEqual(expected_src_tokens, batch["input_ids"][0]) + self.assertEqual(expected_tgt_tokens, targets["input_ids"][0]) + + def test_token_type_ids(self): + src_text_1 = ["A first paragraph for summarization."] + src_text_2 = ["A second paragraph for summarization."] + + fast_token_type_ids = self.t5_base_tokenizer_fast( + src_text_1, src_text_2, add_special_tokens=True, return_token_type_ids=True + ).token_type_ids + slow_token_type_ids = self.t5_base_tokenizer( + src_text_1, src_text_2, add_special_tokens=True, return_token_type_ids=True + ).token_type_ids + + self.assertEqual(slow_token_type_ids, fast_token_type_ids) + self.assertEqual(len(slow_token_type_ids[0]), 18) + + def test_fast_and_slow_same_result(self): + src_text = " Today is nice day " + tgt_ids = [0, 1960, 19, 2, 1245, 239, 1] + tgt_text = " Today is nice day" + + fast_ids = self.t5_base_tokenizer_fast(src_text, add_special_tokens=False).input_ids + slow_ids = self.t5_base_tokenizer(src_text, add_special_tokens=False).input_ids + self.assertEqual(tgt_ids, fast_ids) + self.assertEqual(tgt_ids, slow_ids) + + fast_text = self.t5_base_tokenizer_fast.decode(fast_ids) + slow_text = self.t5_base_tokenizer.decode(fast_ids) + self.assertEqual(tgt_text, fast_text) + self.assertEqual(tgt_text, slow_text) + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + + added_tokens = [f"" for i in range(100)] + [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + r_output = tokenizer_r.encode("Hey this is a token") + cr_output = tokenizer_cr.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in r_output) + self.assertTrue(special_token_id in cr_output) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[31220, 7, 41, 14034, 801, 38, 3, 102, 63, 17, 127, 524, 18, 7031, 2032, 277, 11, 3, 102, 63, 17, 127, 524, 18, 2026, 17, 10761, 18, 7041, 61, 795, 879, 18, 19681, 4648, 7, 41, 12920, 382, 6, 350, 6383, 4949, 6, 2158, 12920, 382, 9, 6, 3, 4, 11160, 6, 2043, 17153, 279, 49, 17, 6, 3, 4, 434, 9688, 11439, 21, 6869, 10509, 17725, 41, 567, 9138, 61, 11, 6869, 10509, 11946, 41, 18207, 517, 61, 28, 147, 3538, 1220, 7140, 10761, 2250, 16, 910, 1220, 8024, 11, 1659, 1413, 32, 883, 2020, 344, 2215, 226, 6, 12901, 382, 127, 524, 11, 4738, 7, 127, 15390, 5, 1], [272, 24203, 19, 876, 12, 554, 18, 9719, 1659, 2647, 26352, 6497, 7, 45, 73, 9339, 400, 26, 1499, 57, 22801, 10760, 30, 321, 646, 11, 269, 2625, 16, 66, 7500, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [37, 1704, 4216, 3, 20400, 4418, 7, 147, 8, 19743, 1782, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="t5-base", + revision="5a7ff2d8f5117c194c7e32ec1ccbf04642cca99b", + ) diff --git a/test_tokenization_tapas.py b/test_tokenization_tapas.py new file mode 100644 index 0000000000000000000000000000000000000000..357fa3773d9b5749c6b94e7daaabf4211e2e9e6f --- /dev/null +++ b/test_tokenization_tapas.py @@ -0,0 +1,1203 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
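The TAPAS tests that follow differ from every other suite in this diff: the tokenizer consumes a pandas DataFrame (all cells as strings) plus a natural-language query rather than plain text. A minimal sketch of that interface, assuming the `google/tapas-base-finetuned-wtq` checkpoint that the slow test loads and a working PyTorch install:

```python
import pandas as pd
from transformers import TapasTokenizer

tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")

# Table cells must be strings; the query is ordinary text.
table = pd.DataFrame.from_dict({"Actors": ["Brad Pitt", "Leonardo Di Caprio"], "Age": ["56", "45"]})
inputs = tokenizer(table=table, queries="How old is Brad Pitt?", return_tensors="pt")

# Alongside input_ids and attention_mask, TAPAS returns token_type_ids that
# encode each token's row/column membership within the table.
print(sorted(inputs.keys()))
```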
+import inspect +import os +import shutil +import tempfile +import unittest +from typing import List + +import numpy as np +import pandas as pd + +from transformers import AddedToken +from transformers.models.tapas.tokenization_tapas import ( + VOCAB_FILES_NAMES, + BasicTokenizer, + TapasTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.testing_utils import ( + is_pt_tf_cross_test, + require_pandas, + require_scatter, + require_tokenizers, + require_torch, + slow, +) + +from .test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings + + +@require_tokenizers +@require_pandas +class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = TapasTokenizer + test_rust_tokenizer = False + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + test_seq2seq = False + + def get_table( + self, + tokenizer: TapasTokenizer, + length=5, + ): + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + + if length == 0: + data = {} + else: + data = {toks[0]: [toks[tok] for tok in range(1, length)]} + + table = pd.DataFrame.from_dict(data) + + return table + + def get_table_and_query( + self, + tokenizer: TapasTokenizer, + length=5, + ): + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + table = self.get_table(tokenizer, length=length - 3) + query = " ".join(toks[:3]) + + return table, query + + def get_clean_sequence( + self, + tokenizer: TapasTokenizer, + with_prefix_space=False, + max_length=20, + min_length=5, + empty_table: bool = False, + add_special_tokens: bool = True, + return_table_and_query: bool = False, + ): + + toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] + + if empty_table: + table = pd.DataFrame.from_dict({}) + query = " ".join(toks[:min_length]) + else: + data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]} + table = pd.DataFrame.from_dict(data) + query = " ".join(toks[:3]) + + output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens) + output_txt = tokenizer.decode(output_ids) + + assert len(output_ids) >= min_length, "Update the code to generate the sequences so that they are larger" + assert len(output_ids) <= max_length, "Update the code to generate the sequences so that they are smaller" + + if return_table_and_query: + return output_txt, output_ids, table, query + + return output_txt, output_ids + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = 
tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # With lower casing + tokenizer = self.get_tokenizer(do_lower_case=True) + rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual( + [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], ["[EMPTY]"], ["[UNK]"]] + ) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("google/tapas-base-finetuned-wtq") + + empty_table = self.get_table(tokenizer, length=0) + table = self.get_table(tokenizer, length=10) + + text = tokenizer.encode(table, add_special_tokens=False) + text_2 = tokenizer.encode(empty_table, "multi-sequence build", add_special_tokens=False) + + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_pair == [101] + text + [102] + text_2 + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + def test_add_special_tokens(self): + tokenizers: List[TapasTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input_table = self.get_table(tokenizer, length=0) + + special_token = "[SPECIAL_TOKEN]" + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_add_tokens_tokenizer(self): + tokenizers: List[TapasTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode( + table, + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", + add_special_tokens=False, + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], 
tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(table, input, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + def test_encode_plus_with_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequence = "Sequence" + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_size = 10 + padding_idx = tokenizer.pad_token_id + token_type_padding_idx = tokenizer.pad_token_type_id + + encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) + + # Test 'longest' and 'no_padding' don't do anything + tokenizer.padding_side = "right" + + # 'longest' on a single sequence should be a no-op + not_padded_sequence = tokenizer.encode_plus( + table, + sequence, + padding="longest", + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + assert sequence_length == not_padded_sequence_length + assert input_ids == not_padded_input_ids + assert special_tokens_mask == not_padded_special_tokens_mask + + # 'no_padding' should likewise be a no-op + not_padded_sequence = tokenizer.encode_plus( + table, + sequence, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + assert sequence_length == not_padded_sequence_length + assert input_ids == not_padded_input_ids + assert special_tokens_mask == not_padded_special_tokens_mask + + # Test right padding + tokenizer.padding_side = "right" + + right_padded_sequence = tokenizer.encode_plus( + table, + sequence, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + right_padded_input_ids = right_padded_sequence["input_ids"] + + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + assert sequence_length + padding_size == right_padded_sequence_length + assert input_ids + [padding_idx] * padding_size == right_padded_input_ids + assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask + + # Test left padding +
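+ # (note) with padding_side="left" the pad ids are prepended rather than appended, so the expected result is [padding_idx] * padding_size + input_ids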
tokenizer.padding_side = "left" + left_padded_sequence = tokenizer.encode_plus( + table, + sequence, + max_length=sequence_length + padding_size, + padding="max_length", + return_special_tokens_mask=True, + ) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) + + assert sequence_length + padding_size == left_padded_sequence_length + assert [padding_idx] * padding_size + input_ids == left_padded_input_ids + assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = right_padded_sequence["token_type_ids"] + + assert ( + token_type_ids + [[token_type_padding_idx] * 7] * padding_size == right_padded_token_type_ids + ) + assert [[token_type_padding_idx] * 7] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + assert attention_mask + [0] * padding_size == right_padded_attention_mask + assert [0] * padding_size + attention_mask == left_padded_attention_mask + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + input_text, output_text = self.get_input_output_texts(tokenizer) + + tokens = tokenizer.tokenize(input_text) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + self.assertEqual(text_2, output_text) + + def test_mask_output(self): + tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table, query = self.get_table_and_query(tokenizer) + + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): + information = tokenizer.encode_plus(table, query, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) + + @unittest.skip("TAPAS tokenizer only handles two sequences.") + def test_maximum_encoding_length_pair_input(self): + pass + + @unittest.skip("TAPAS tokenizer only handles two sequences.") + def test_maximum_encoding_length_single_input(self): + pass + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + table, query = self.get_table_and_query(tokenizer) + + sequences = tokenizer.encode(table, query, add_special_tokens=False) + attached_sequences = tokenizer.encode(table, query, add_special_tokens=True) + + # Method is implemented (e.g. 
not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) + ) + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility, but it should be removed once `pad_to_max_length` is deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer) + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + # FIXME: the next call should use padding="max_length" instead of padding=True to avoid a warning + padded_sequence = tokenizer.encode( + table, sequence, max_length=sequence_length + padding_size, padding=True + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + def test_call(self): + # Tests that calling the tokenizer directly wraps encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + # Test not batched + table = self.get_table(tokenizer, length=0) + encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0]) + encoded_sequences_2 = tokenizer(table, sequences[0]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + table = self.get_table(tokenizer, length=10) + encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1]) + encoded_sequences_2 = tokenizer(table, sequences[1]) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + table = self.get_table(tokenizer, length=0) + encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences) + encoded_sequences_2 = tokenizer(table, sequences) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_batch_encode_plus_batch_sequence_length(self): + # Tests that all encoded values have the correct size + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences] + encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + maximum_length = len( + max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) + ) + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences_padded = [ + tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length") + for sequence in sequences + ] + + encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True) + self.assertListEqual( + encoded_sequences_padded, + self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), + ) + + # check 'longest' is insensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + table, sequences, max_length=maximum_length + 10, padding="longest" + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + # check 'no_padding' is insensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + table, sequences, max_length=maximum_length + 10, padding=False + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + def test_batch_encode_plus_overflowing_tokens(self): + pass + + def test_batch_encode_plus_padding(self): + # Test that padded sequences are equivalent between batch_encode_plus and encode_plus + + # Right padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences = [ + tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") + for sequence in sequences + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + table, sequences, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + # Left padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokenizer.padding_side = "left" + # build a fresh table for this tokenizer rather than relying on the variable left over from the right-padding loop above + table = self.get_table(tokenizer, length=0) + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequences) + + encoded_sequences = [ + tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") + for sequence in sequences + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + table, sequences, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + if tokenizer.pad_token is None: + self.skipTest("No padding token.") + else: + empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8) + for key, value in empty_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} should not be padded to a multiple of 8 when padding is off") + + # Should also work with truncation + normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + @unittest.skip("TAPAS cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`") + def test_prepare_for_model(self): + pass + + def test_tokenizer_slow_store_full_signature(self): + signature = inspect.signature(self.tokenizer_class.__init__) + tokenizer = self.get_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty: + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + empty_table = self.get_table(tokenizer, length=0) + table = self.get_table(tokenizer, length=10) + encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + table, + sequence_0, + add_special_tokens=True, + return_special_tokens_mask=True, + # add_prefix_space=False, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequence_0 = "Encode this."
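+ # (note) special_tokens_mask marks tokens added by the tokenizer ([CLS], [SEP], ...) with 1 and regular tokens with 0, so filtering on it recovers the bare encoding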
+ # Testing single inputs + encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + table = self.get_table(tokenizer, length=0) + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + table = self.get_table(tokenizer, length=0) + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + table, sequence, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + table, sequence, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + encoded_sequence = tokenizer.encode(table, sequence) + sequence_length = len(encoded_sequence) 
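+ # (note) without an explicit max_length, padding=True / padding="longest" pads to the longest item in the batch, so a single sequence comes back unchanged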
+ + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(table, sequence, padding=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(table, sequence, padding="longest") + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(table, sequence) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(table, sequence, padding=False) + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + empty_table = self.get_table(tokenizer, length=0) + seq_0 = "Test this method." + + # We want sequence 0 and sequence 1 to be tagged with 0 and 1 token_ids + # respectively (regardless of whether the model uses token type ids) + # The QA pipeline, among other places, relies on this assumption + output = tokenizer(empty_table, seq_0, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that each token type ID has 7 values + self.assertTrue(all(len(token_type_ids) == 7 for token_type_ids in output["token_type_ids"])) + + # Do the same test as modeling common.
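+ # (illustrative note) TAPAS carries 7 token type ids per position -- segment_ids, column_ids, row_ids, prev_labels, column_ranks, inv_column_ranks and numeric_relations; [CLS] and query tokens are all-zero in every dimension, which the assertion below relies on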
+ self.assertIn(0, output["token_type_ids"][0]) + + @require_torch + @slow + @require_scatter + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + sequence = " ".join(first_ten_tokens) + table = self.get_table(tokenizer, length=0) + encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="pt") + batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="pt") + # This should not fail + + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + + @unittest.skip("TAPAS doesn't handle pre-tokenized inputs.") + def test_pretokenized_inputs(self): + pass + + @slow + def test_tapas_truncation_integration_test(self): + data = { + "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + "Age": ["56", "45", "59"], + "Number of movies": ["87", "53", "69"], + "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], + } + queries = [ + "When was Brad Pitt born?", + "Which actor appeared in the least number of movies?", + "What is the average number of movies?", + ] + table = pd.DataFrame.from_dict(data) + + tokenizer = TapasTokenizer.from_pretrained("lysandre/tapas-temporary-repo", model_max_length=512) + + for i in range(12): + # The table cannot even encode the headers, so raise an error + with self.assertRaises(ValueError): + tokenizer.encode(table=table, query=queries[0], max_length=i, truncation="drop_rows_to_fit") + + for i in range(12, 512): + new_encoded_inputs = tokenizer.encode( + table=table, query=queries[0], max_length=i, truncation="drop_rows_to_fit" + ) + + # Ensure that the input IDs are less than the max length defined. 
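+ # (note) truncation="drop_rows_to_fit" removes whole table rows until the encoding fits in max_length, instead of cutting the sequence off mid-cell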
+ self.assertLessEqual(len(new_encoded_inputs), i) + + tokenizer.model_max_length = 20 + new_encoded_inputs = tokenizer.encode(table=table, query=queries[0], truncation=True) + dropped_encoded_inputs = tokenizer.encode(table=table, query=queries[0], truncation="drop_rows_to_fit") + + # Ensure that the input IDs are still truncated when no max_length is specified + self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) + self.assertLessEqual(len(new_encoded_inputs), 20) + + @is_pt_tf_cross_test + def test_batch_encode_plus_tensors(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequences = [ + "Testing batch encode plus", + "Testing batch encode plus with different sequence lengths", + "Testing batch encode plus with different sequence lengths correctly pads", + ] + + table = self.get_table(tokenizer, length=0) + + # A tensor cannot be built from sequences that are not all the same size + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt") + self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf") + + if tokenizer.pad_token_id is None: + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding=True, + return_tensors="pt", + ) + self.assertRaises( + ValueError, + tokenizer.batch_encode_plus, + table, + sequences, + padding="longest", + return_tensors="tf", + ) + else: + pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt") + tensorflow_tensor = tokenizer.batch_encode_plus( + table, sequences, padding="longest", return_tensors="tf" + ) + encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True) + + for key in encoded_sequences.keys(): + pytorch_value = pytorch_tensor[key].tolist() + tensorflow_value = tensorflow_tensor[key].numpy().tolist() + encoded_value = encoded_sequences[key] + + # the plain Python lists and both tensor backends must all agree + self.assertEqual(pytorch_value, tensorflow_value) + self.assertEqual(pytorch_value, encoded_value) + + @slow + def test_tapas_integration_test(self): + data = { + "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + "Age": ["56", "45", "59"], + "Number of movies": ["87", "53", "69"], + "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], + } + queries = [ + "When was Brad Pitt born?", + "Which actor appeared in the least number of movies?", + "What is the average number of movies?", + ] + table = pd.DataFrame.from_dict(data) + + tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq", model_max_length=512) + + # fmt: off + expected_results = 
{'input_ids':[101,2043,2001,8226,15091,2141,1029,102,5889,2287,2193,1997,5691,3058,1997,4182,8226,15091,5179,6584,2324,2285,3699,14720,4487,6178,9488,3429,5187,2340,2281,3326,2577,18856,7828,3240,5354,6353,1020,2089,3777],'attention_mask':[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],'token_type_ids':[[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[1,1,0,0,0,0,0],[1,2,0,0,0,0,0],[1,3,0,0,0,0,0],[1,3,0,0,0,0,0],[1,3,0,0,0,0,0],[1,4,0,0,0,0,0],[1,4,0,0,0,0,0],[1,4,0,0,0,0,0],[1,1,1,0,0,0,0],[1,1,1,0,0,0,0],[1,2,1,0,2,2,0],[1,3,1,0,3,1,0],[1,4,1,0,2,2,0],[1,4,1,0,2,2,0],[1,4,1,0,2,2,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,2,2,0,1,3,0],[1,3,2,0,1,3,0],[1,4,2,0,3,1,0],[1,4,2,0,3,1,0],[1,4,2,0,3,1,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,2,3,0,3,1,0],[1,3,3,0,2,2,0],[1,4,3,0,1,3,0],[1,4,3,0,1,3,0],[1,4,3,0,1,3,0]]} # noqa: E231 + # fmt: on + + new_encoded_inputs = tokenizer.encode_plus(table=table, query=queries[0]) + + self.assertDictEqual(dict(new_encoded_inputs), expected_results) + + @slow + def test_full_tokenizer(self): + data = [ + ["Pos", "No", "Driver", "Team", "Laps", "Time/Retired", "Grid", "Points"], + ["1", "32", "Patrick Carpentier", "Team Player's", "87", "1:48:11.023", "1", "22"], + ["2", "1", "Bruno Junqueira", "Newman/Haas Racing", "87", "+0.8 secs", "2", "17"], + ["3", "3", "Paul Tracy", "Team Player's", "87", "+28.6 secs", "3", "14"], + ["4", "9", "Michel Jourdain, Jr.", "Team Rahal", "87", "+40.8 secs", "13", "12"], + ["5", "34", "Mario Haberfeld", "Mi-Jack Conquest Racing", "87", "+42.1 secs", "6", "10"], + ["6", "20", "Oriol Servia", "Patrick Racing", "87", "+1:00.2", "10", "8"], + ["7", "51", "Adrian Fernandez", "Fernandez Racing", "87", "+1:01.4", "5", "6"], + ["8", "12", "Jimmy Vasser", "American Spirit Team Johansson", "87", "+1:01.8", "8", "5"], + ["9", "7", "Tiago Monteiro", "Fittipaldi-Dingman Racing", "86", "+ 1 Lap", "15", "4"], + ["10", "55", "Mario Dominguez", "Herdez Competition", "86", "+ 1 Lap", "11", "3"], + ["11", "27", "Bryan Herta", "PK Racing", "86", "+ 1 Lap", "12", "2"], + ["12", "31", "Ryan Hunter-Reay", "American Spirit Team Johansson", "86", "+ 1 Lap", "17", "1"], + ["13", "19", "Joel Camathias", "Dale Coyne Racing", "85", "+ 2 Laps", "18", "0"], + ["14", "33", "Alex Tagliani", "Rocketsports Racing", "85", "+ 2 Laps", "14", "0"], + ["15", "4", "Roberto Moreno", "Herdez Competition", "85", "+ 2 Laps", "9", "0"], + ["16", "11", "Geoff Boss", "Dale Coyne Racing", "83", "Mechanical", "19", "0"], + ["17", "2", "Sebastien Bourdais", "Newman/Haas Racing", "77", "Mechanical", "4", "0"], + ["18", "15", "Darren Manning", "Walker Racing", "12", "Mechanical", "7", "0"], + ["19", "5", "Rodolfo Lavin", "Walker Racing", "10", "Mechanical", "16", "0"], + ] + query = "what were the drivers names?" 
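+ # (note) TapasTokenizer expects every table cell to be a string, which is why the numeric columns in the fixture above are quoted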
+ table = pd.DataFrame.from_records(data[1:], columns=data[0]) + + tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq", model_max_length=512) + model_inputs = tokenizer(table, query, padding="max_length") + + input_ids = model_inputs["input_ids"] + token_type_ids = np.array(model_inputs["token_type_ids"]) + segment_ids = token_type_ids[:, 0] + column_ids = token_type_ids[:, 1] + row_ids = token_type_ids[:, 2] + + # fmt: off + expected_results = {'input_ids':[101,2054,2020,1996,6853,3415,1029,102,13433,2015,2053,4062,2136,10876,2051,1013,3394,8370,2685,1015,3590,4754,29267,4765,3771,2136,2447,1005,1055,6584,1015,1024,4466,1024,2340,1012,6185,2509,1015,2570,1016,1015,10391,12022,4226,7895,10625,1013,22996,3868,6584,1009,1014,1012,1022,10819,2015,1016,2459,1017,1017,2703,10555,2136,2447,1005,1055,6584,1009,2654,1012,1020,10819,2015,1017,2403,1018,1023,8709,8183,3126,21351,2078,1010,3781,1012,2136,10958,8865,6584,1009,2871,1012,1022,10819,2015,2410,2260,1019,4090,7986,5292,5677,8151,2771,1011,2990,9187,3868,6584,1009,4413,1012,1015,10819,2015,1020,2184,1020,2322,2030,20282,14262,9035,4754,3868,6584,1009,1015,1024,4002,1012,1016,2184,1022,1021,4868,7918,12023,12023,3868,6584,1009,1015,1024,5890,1012,1018,1019,1020,1022,2260,5261,12436,18116,2137,4382,2136,26447,6584,1009,1015,1024,5890,1012,1022,1022,1019,1023,1021,27339,3995,10125,9711,4906,25101,24657,1011,22033,2386,3868,6564,1009,1015,5001,2321,1018,2184,4583,7986,14383,2075,29488,14906,9351,2971,6564,1009,1015,5001,2340,1017,2340,2676,8527,2014,2696,1052,2243,3868,6564,1009,1015,5001,2260,1016,2260,2861,4575,4477,1011,2128,4710,2137,4382,2136,26447,6564,1009,1015,5001,2459,1015,2410,2539,8963,11503,25457,3022,8512,2522,9654,3868,5594,1009,1016,10876,2324,1014,2403,3943,4074,6415,15204,2072,12496,25378,3868,5594,1009,1016,10876,2403,1014,2321,1018,10704,17921,14906,9351,2971,5594,1009,1016,10876,1023,1014,2385,2340,14915,5795,8512,2522,9654,3868,6640,6228,2539,1014,2459,1016,28328,8945,3126,21351,2015,10625,1013,22996,3868,6255,6228,1018,1014,2324,2321,12270,11956,5232,3868,2260,6228,1021,1014,2539,1019,8473,28027,2080,2474,6371,5232,3868,2184,6228,2385,1014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],'column_ids':[0,0,0,0,0,0,0,0,1,1,2,3,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,3,3,3,3,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,3,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,4,4,4,4,5,6,7,8,1,2,3,3,3,3,3,4,4,4,4,5,6,7,8,1,2,3,3,4,4,5,6,7,8,1,2,3,3,3,3,3,4,4,5,6,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0],'row_ids':[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,19,19,19,19,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],'segment_ids':[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]} # noqa: E231 + # fmt: on + + self.assertListEqual(input_ids, expected_results["input_ids"]) + self.assertListEqual(segment_ids.tolist(), expected_results["segment_ids"]) + self.assertListEqual(column_ids.tolist(), expected_results["column_ids"]) + self.assertListEqual(row_ids.tolist(), expected_results["row_ids"]) + + @unittest.skip("Skip this test while all models are still to be uploaded.") + def test_pretrained_model_lists(self): + pass + + @unittest.skip("Doesn't support another framework than PyTorch") + def test_np_encode_plus_sent_to_model(self): + pass diff --git a/test_tokenization_transfo_xl.py b/test_tokenization_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..fab369484450fc9b980163ab296ec1d264a3bebb --- /dev/null +++ b/test_tokenization_transfo_xl.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers.models.transfo_xl.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = TransfoXLTokenizer + test_rust_tokenizer = False + test_seq2seq = False + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "", + "[CLS]", + "[SEP]", + "want", + "unwanted", + "wa", + "un", + "running", + ",", + "low", + "l", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + kwargs["lower_case"] = True + return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = " UNwanted , running" + output_text = " unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) + + tokens = tokenizer.tokenize(" UNwanted , running") + self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + + def test_full_tokenizer_lower(self): + tokenizer = TransfoXLTokenizer(lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"] + ) + + def test_full_tokenizer_no_lower(self): + tokenizer = TransfoXLTokenizer(lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_full_tokenizer_moses_numbers(self): + tokenizer = TransfoXLTokenizer(lower_case=False) + text_in = "Hello (bracket) and side-scrolled [and] Henry's $5,000 with 3.34 m. What's up!?" 
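+ # (illustrative note) TransfoXLTokenizer runs Moses-style tokenization and protects in-number separators and in-word hyphens with the @,@ / @.@ / @-@ placeholders, so convert_tokens_to_string can losslessly undo the split, as checked below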
+ tokens_out = [ + "Hello", + "(", + "bracket", + ")", + "and", + "side", + "@-@", + "scrolled", + "[", + "and", + "]", + "Henry", + "'s", + "$", + "5", + "@,@", + "000", + "with", + "3", + "@.@", + "34", + "m", + ".", + "What", + "'s", + "up", + "!", + "?", + ] + + self.assertListEqual(tokenizer.tokenize(text_in), tokens_out) + + self.assertEqual(tokenizer.convert_tokens_to_string(tokens_out), text_in) + + def test_move_added_token(self): + tokenizer = self.get_tokenizer() + original_len = len(tokenizer) + + tokenizer.add_tokens(["new1", "new2"]) + tokenizer.move_added_token("new1", 1) + + # Check that moved token is not copied (duplicate) + self.assertEqual(len(tokenizer), original_len + 2) + # Check that token is moved to specified id + self.assertEqual(tokenizer.encode("new1"), [1]) + self.assertEqual(tokenizer.decode([1]), "new1") diff --git a/test_tokenization_utils.py b/test_tokenization_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a655b84dc16c684ba64ab079ebf77a1fe2912d87 --- /dev/null +++ b/test_tokenization_utils.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc.. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +isort:skip_file +""" +import os +import pickle +import tempfile +import unittest +from typing import Callable, Optional + +import numpy as np + +# Ensure there are no circular imports when importing the parent class +from transformers import PreTrainedTokenizerFast + +from transformers import ( + BatchEncoding, + BertTokenizer, + BertTokenizerFast, + PreTrainedTokenizer, + TensorType, + TokenSpan, + is_tokenizers_available, +) +from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer +from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow + + +if is_tokenizers_available(): + from tokenizers import Tokenizer + from tokenizers.models import WordPiece + + +class TokenizerUtilsTest(unittest.TestCase): + def check_tokenizer_from_pretrained(self, tokenizer_class): + s3_models = list(tokenizer_class.max_model_input_sizes.keys()) + for model_name in s3_models[:1]: + tokenizer = tokenizer_class.from_pretrained(model_name) + self.assertIsNotNone(tokenizer) + self.assertIsInstance(tokenizer, tokenizer_class) + self.assertIsInstance(tokenizer, PreTrainedTokenizer) + + for special_tok in tokenizer.all_special_tokens: + self.assertIsInstance(special_tok, str) + special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) + self.assertIsInstance(special_tok_id, int) + + def assert_dump_and_restore(self, be_original: BatchEncoding, equal_op: Optional[Callable] = None): + batch_encoding_str = pickle.dumps(be_original) + self.assertIsNotNone(batch_encoding_str) + + be_restored = pickle.loads(batch_encoding_str) + + # Ensure is_fast is correctly restored + self.assertEqual(be_restored.is_fast, be_original.is_fast) + + # Ensure encodings are potentially correctly restored + if be_original.is_fast: + self.assertIsNotNone(be_restored.encodings) + else: + 
self.assertIsNone(be_restored.encodings) + + # Ensure the restored values match the originals (the keys are shared by construction) + for original_v, restored_v in zip(be_original.values(), be_restored.values()): + if equal_op: + self.assertTrue(equal_op(restored_v, original_v)) + else: + self.assertEqual(restored_v, original_v) + + @slow + def test_pretrained_tokenizers(self): + self.check_tokenizer_from_pretrained(GPT2Tokenizer) + + def test_tensor_type_from_str(self): + self.assertEqual(TensorType("tf"), TensorType.TENSORFLOW) + self.assertEqual(TensorType("pt"), TensorType.PYTORCH) + self.assertEqual(TensorType("np"), TensorType.NUMPY) + + @require_tokenizers + def test_batch_encoding_pickle(self): + # BatchEncoding supports pickling; slow (Python) tokenizers drop the `encodings` attribute while fast (Rust) ones keep it + tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") + tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") + + # Python no tensor + with self.subTest("BatchEncoding (Python, return_tensors=None)"): + self.assert_dump_and_restore(tokenizer_p("Small example to encode")) + + with self.subTest("BatchEncoding (Python, return_tensors=NUMPY)"): + self.assert_dump_and_restore( + tokenizer_p("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal + ) + + with self.subTest("BatchEncoding (Rust, return_tensors=None)"): + self.assert_dump_and_restore(tokenizer_r("Small example to encode")) + + with self.subTest("BatchEncoding (Rust, return_tensors=NUMPY)"): + self.assert_dump_and_restore( + tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal + ) + + @require_tf + @require_tokenizers + def test_batch_encoding_pickle_tf(self): + import tensorflow as tf + + def tf_array_equals(t1, t2): + return tf.reduce_all(tf.equal(t1, t2)) + + tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") + tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") + + with self.subTest("BatchEncoding (Python, return_tensors=TENSORFLOW)"): + self.assert_dump_and_restore( + tokenizer_p("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals + ) + + with self.subTest("BatchEncoding (Rust, return_tensors=TENSORFLOW)"): + self.assert_dump_and_restore( + tokenizer_r("Small example to encode", return_tensors=TensorType.TENSORFLOW), tf_array_equals + ) + + @require_torch + @require_tokenizers + def test_batch_encoding_pickle_pt(self): + import torch + + tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") + tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") + + with self.subTest("BatchEncoding (Python, return_tensors=PYTORCH)"): + self.assert_dump_and_restore( + tokenizer_p("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal + ) + + with self.subTest("BatchEncoding (Rust, return_tensors=PYTORCH)"): + self.assert_dump_and_restore( + tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal + ) + + @require_tokenizers + def test_batch_encoding_is_fast(self): + tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") + tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") + + with self.subTest("Python Tokenizer"): + self.assertFalse(tokenizer_p("Small example to_encode").is_fast) + + with self.subTest("Rust Tokenizer"): + self.assertTrue(tokenizer_r("Small example to_encode").is_fast) + + @require_tokenizers + def test_batch_encoding_word_to_tokens(self): + tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") + encoded = tokenizer_r(["Test", "\xad", "test"], is_split_into_words=True) + + self.assertEqual(encoded.word_to_tokens(0), 
TokenSpan(start=1, end=2)) + self.assertEqual(encoded.word_to_tokens(1), None) + self.assertEqual(encoded.word_to_tokens(2), TokenSpan(start=2, end=3)) + + def test_batch_encoding_with_labels(self): + batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]}) + tensor_batch = batch.convert_to_tensors(tensor_type="np") + self.assertEqual(tensor_batch["inputs"].shape, (2, 3)) + self.assertEqual(tensor_batch["labels"].shape, (2,)) + # test converting the converted + with CaptureStderr() as cs: + tensor_batch = batch.convert_to_tensors(tensor_type="np") + self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}") + + batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0}) + tensor_batch = batch.convert_to_tensors(tensor_type="np", prepend_batch_axis=True) + self.assertEqual(tensor_batch["inputs"].shape, (1, 3)) + self.assertEqual(tensor_batch["labels"].shape, (1,)) + + @require_torch + def test_batch_encoding_with_labels_pt(self): + batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]}) + tensor_batch = batch.convert_to_tensors(tensor_type="pt") + self.assertEqual(tensor_batch["inputs"].shape, (2, 3)) + self.assertEqual(tensor_batch["labels"].shape, (2,)) + # test converting the converted + with CaptureStderr() as cs: + tensor_batch = batch.convert_to_tensors(tensor_type="pt") + self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}") + + batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0}) + tensor_batch = batch.convert_to_tensors(tensor_type="pt", prepend_batch_axis=True) + self.assertEqual(tensor_batch["inputs"].shape, (1, 3)) + self.assertEqual(tensor_batch["labels"].shape, (1,)) + + @require_tf + def test_batch_encoding_with_labels_tf(self): + batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]}) + tensor_batch = batch.convert_to_tensors(tensor_type="tf") + self.assertEqual(tensor_batch["inputs"].shape, (2, 3)) + self.assertEqual(tensor_batch["labels"].shape, (2,)) + # test converting the converted + with CaptureStderr() as cs: + tensor_batch = batch.convert_to_tensors(tensor_type="tf") + self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}") + + batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0}) + tensor_batch = batch.convert_to_tensors(tensor_type="tf", prepend_batch_axis=True) + self.assertEqual(tensor_batch["inputs"].shape, (1, 3)) + self.assertEqual(tensor_batch["labels"].shape, (1,)) + + @require_flax + def test_batch_encoding_with_labels_jax(self): + batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]}) + tensor_batch = batch.convert_to_tensors(tensor_type="jax") + self.assertEqual(tensor_batch["inputs"].shape, (2, 3)) + self.assertEqual(tensor_batch["labels"].shape, (2,)) + # test converting the converted + with CaptureStderr() as cs: + tensor_batch = batch.convert_to_tensors(tensor_type="jax") + self.assertFalse(len(cs.err), msg=f"should have no warning, but got {cs.err}") + + batch = BatchEncoding({"inputs": [1, 2, 3], "labels": 0}) + tensor_batch = batch.convert_to_tensors(tensor_type="jax", prepend_batch_axis=True) + self.assertEqual(tensor_batch["inputs"].shape, (1, 3)) + self.assertEqual(tensor_batch["labels"].shape, (1,)) + + def test_padding_accepts_tensors(self): + features = [{"input_ids": np.array([0, 1, 2])}, {"input_ids": np.array([0, 1, 2, 3])}] + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + + batch = tokenizer.pad(features, padding=True) + self.assertTrue(isinstance(batch["input_ids"], 
np.ndarray)) + self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + batch = tokenizer.pad(features, padding=True, return_tensors="np") + self.assertTrue(isinstance(batch["input_ids"], np.ndarray)) + self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + + @require_torch + def test_padding_accepts_tensors_pt(self): + import torch + + features = [{"input_ids": torch.tensor([0, 1, 2])}, {"input_ids": torch.tensor([0, 1, 2, 3])}] + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + + batch = tokenizer.pad(features, padding=True) + self.assertTrue(isinstance(batch["input_ids"], torch.Tensor)) + self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + batch = tokenizer.pad(features, padding=True, return_tensors="pt") + self.assertTrue(isinstance(batch["input_ids"], torch.Tensor)) + self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + + @require_tf + def test_padding_accepts_tensors_tf(self): + import tensorflow as tf + + features = [{"input_ids": tf.constant([0, 1, 2])}, {"input_ids": tf.constant([0, 1, 2, 3])}] + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + + batch = tokenizer.pad(features, padding=True) + self.assertTrue(isinstance(batch["input_ids"], tf.Tensor)) + self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + batch = tokenizer.pad(features, padding=True, return_tensors="tf") + self.assertTrue(isinstance(batch["input_ids"], tf.Tensor)) + self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + + @require_tokenizers + def test_instantiation_from_tokenizers(self): + bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) + PreTrainedTokenizerFast(tokenizer_object=bert_tokenizer) + + @require_tokenizers + def test_instantiation_from_tokenizers_json_file(self): + bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) + with tempfile.TemporaryDirectory() as tmpdirname: + bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json")) + PreTrainedTokenizerFast(tokenizer_file=os.path.join(tmpdirname, "tokenizer.json")) diff --git a/test_tokenization_wav2vec2.py b/test_tokenization_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..db9d302200aa36e997cfd54cc1d113db10f4ee75 --- /dev/null +++ b/test_tokenization_wav2vec2.py @@ -0,0 +1,556 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
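For reference, the utilities exercised by test_tokenization_utils.py above boil down to two behaviours: a `BatchEncoding` survives a pickle round-trip, and `tokenizer.pad` accepts features that were encoded earlier. A minimal sketch, assuming the same `bert-base-cased` checkpoint the tests use:

```python
import pickle

import numpy as np

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# A BatchEncoding survives pickling, including its numpy tensors.
encoding = tokenizer("Small example to encode", return_tensors="np")
restored = pickle.loads(pickle.dumps(encoding))
assert np.array_equal(restored["input_ids"], encoding["input_ids"])

# `pad` also takes pre-encoded features and pads them to the longest entry.
features = [{"input_ids": np.array([0, 1, 2])}, {"input_ids": np.array([0, 1, 2, 3])}]
batch = tokenizer.pad(features, padding=True, return_tensors="np")
assert batch["input_ids"].shape == (2, 4)
```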
+"""Tests for the Wav2Vec2 tokenizer.""" +import inspect +import json +import os +import random +import shutil +import tempfile +import unittest + +import numpy as np + +from transformers import ( + WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, + Wav2Vec2Config, + Wav2Vec2CTCTokenizer, + Wav2Vec2Tokenizer, +) +from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES +from transformers.testing_utils import require_torch, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +global_rng = random.Random() + + +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +class Wav2Vec2TokenizerTest(unittest.TestCase): + tokenizer_class = Wav2Vec2Tokenizer + + def setUp(self): + super().setUp() + + vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} + + self.tmpdirname = tempfile.mkdtemp() + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def test_tokenizer_decode(self): + # TODO(PVP) - change to facebook + tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + tokens = tokenizer.decode(sample_ids[0]) + batch_tokens = tokenizer.batch_decode(sample_ids) + self.assertEqual(tokens, batch_tokens[0]) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_special(self): + # TODO(PVP) - change to facebook + tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + sample_ids_2 = [ + [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98], + [ + 24, + 22, + 5, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.word_delimiter_token_id, + 24, + 22, + 5, + 77, + tokenizer.word_delimiter_token_id, + ], + ] + + batch_tokens = tokenizer.batch_decode(sample_ids) + batch_tokens_2 = tokenizer.batch_decode(sample_ids_2) + self.assertEqual(batch_tokens, batch_tokens_2) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_added_tokens(self): + tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + tokenizer.add_tokens(["!", "?"]) + tokenizer.add_special_tokens({"cls_token": "$$$"}) + + sample_ids = [ + [ + 11, + 5, + 15, + tokenizer.pad_token_id, + 15, + 8, + 98, + 32, + 32, + 33, + tokenizer.word_delimiter_token_id, + 32, + 32, + 33, + 34, + 34, + ], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], + ] + batch_tokens = tokenizer.batch_decode(sample_ids) + + self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + + def test_call(self): + # Tests that all call wrap to encode_plus 
and batch_encode_plus
+        tokenizer = self.get_tokenizer()
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        # Test not batched input
+        encoded_sequences_1 = tokenizer(speech_inputs[0], return_tensors="np").input_values
+        encoded_sequences_2 = tokenizer(np_speech_inputs[0], return_tensors="np").input_values
+        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
+
+        # Test batched
+        encoded_sequences_1 = tokenizer(speech_inputs, return_tensors="np").input_values
+        encoded_sequences_2 = tokenizer(np_speech_inputs, return_tensors="np").input_values
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+    def test_padding(self, max_length=50):
+        def _input_values_have_equal_length(input_values):
+            length = len(input_values[0])
+            for input_values_slice in input_values[1:]:
+                if len(input_values_slice) != length:
+                    return False
+            return True
+
+        def _input_values_are_equal(input_values_1, input_values_2):
+            if len(input_values_1) != len(input_values_2):
+                return False
+
+            for input_values_slice_1, input_values_slice_2 in zip(input_values_1, input_values_2):
+                if not np.allclose(np.asarray(input_values_slice_1), np.asarray(input_values_slice_2), atol=1e-3):
+                    return False
+            return True
+
+        tokenizer = self.get_tokenizer()
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        input_values_1 = tokenizer(speech_inputs).input_values
+        input_values_2 = tokenizer(speech_inputs, padding="longest").input_values
+        input_values_3 = tokenizer(speech_inputs, padding="longest", max_length=1600).input_values
+
+        self.assertFalse(_input_values_have_equal_length(input_values_1))
+        self.assertTrue(_input_values_have_equal_length(input_values_2))
+        self.assertTrue(_input_values_have_equal_length(input_values_3))
+        self.assertTrue(_input_values_are_equal(input_values_2, input_values_3))
+        self.assertEqual(len(input_values_1[0]), 800)
+        self.assertEqual(len(input_values_2[0]), 1200)
+        # padding should be 0.0
+        self.assertTrue(abs(sum(np.asarray(input_values_2[0])[800:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_2[1])[1000:])) < 1e-3)
+
+        input_values_4 = tokenizer(speech_inputs, padding="max_length").input_values
+        input_values_5 = tokenizer(speech_inputs, padding="max_length", max_length=1600).input_values
+
+        self.assertTrue(_input_values_are_equal(input_values_1, input_values_4))
+        self.assertEqual(input_values_5.shape, (3, 1600))
+        # padding should be 0.0
+        self.assertTrue(abs(sum(np.asarray(input_values_5[0])[800:1200])) < 1e-3)
+
+        input_values_6 = tokenizer(speech_inputs, pad_to_multiple_of=500).input_values
+        input_values_7 = tokenizer(speech_inputs, padding="longest", pad_to_multiple_of=500).input_values
+        input_values_8 = tokenizer(
+            speech_inputs, padding="max_length", pad_to_multiple_of=500, max_length=2400
+        ).input_values
+
+        self.assertTrue(_input_values_are_equal(input_values_1, input_values_6))
+        self.assertEqual(input_values_7.shape, (3, 1500))
+        self.assertEqual(input_values_8.shape, (3, 2500))
+        # padding should be 0.0
+        self.assertTrue(abs(sum(np.asarray(input_values_7[0])[800:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_7[1])[1000:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_7[2])[1200:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_8[0])[800:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_8[1])[1000:])) < 1e-3)
+        self.assertTrue(abs(sum(np.asarray(input_values_8[2])[1200:])) < 1e-3)
+
+    def test_save_pretrained(self):
+        pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0]
+        tokenizer = self.tokenizer_class.from_pretrained(pretrained_name)
+        tmpdirname2 = tempfile.mkdtemp()
+
+        tokenizer_files = tokenizer.save_pretrained(tmpdirname2)
+        self.assertSequenceEqual(
+            sorted(tuple(VOCAB_FILES_NAMES.values()) + ("special_tokens_map.json", "added_tokens.json")),
+            sorted(tuple(x.split(os.path.sep)[-1] for x in tokenizer_files)),
+        )
+
+        # Checks everything loads correctly in the same way
+        tokenizer_p = self.tokenizer_class.from_pretrained(tmpdirname2)
+
+        # Check that the special tokens survive the save/load round-trip
+        for key in tokenizer.special_tokens_map:
+            self.assertIn(key, tokenizer_p.special_tokens_map)
+
+        shutil.rmtree(tmpdirname2)
+
+    def test_get_vocab(self):
+        tokenizer = self.get_tokenizer()
+        vocab_dict = tokenizer.get_vocab()
+        self.assertIsInstance(vocab_dict, dict)
+        self.assertGreaterEqual(len(tokenizer), len(vocab_dict))
+
+        vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
+        self.assertEqual(len(vocab), len(tokenizer))
+
+        tokenizer.add_tokens(["asdfasdfasdfasdf"])
+        vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
+        self.assertEqual(len(vocab), len(tokenizer))
+
+    def test_save_and_load_tokenizer(self):
+        tokenizer = self.get_tokenizer()
+        # Isolate this from the other tests because we save additional tokens/etc
+        tmpdirname = tempfile.mkdtemp()
+
+        sample_ids = [0, 1, 4, 8, 9, 0, 12]
+        before_tokens = tokenizer.decode(sample_ids)
+        before_vocab = tokenizer.get_vocab()
+        tokenizer.save_pretrained(tmpdirname)
+
+        after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
+        after_tokens = after_tokenizer.decode(sample_ids)
+        after_vocab = after_tokenizer.get_vocab()
+
+        self.assertEqual(before_tokens, after_tokens)
+        self.assertDictEqual(before_vocab, after_vocab)
+
+        shutil.rmtree(tmpdirname)
+
+        tokenizer = self.get_tokenizer()
+
+        # Isolate this from the other tests because we save additional tokens/etc
+        tmpdirname = tempfile.mkdtemp()
+
+        before_len = len(tokenizer)
+        sample_ids = [0, 1, 4, 8, 9, 0, 12, before_len, before_len + 1, before_len + 2]
+        tokenizer.add_tokens(["?", "!"])
+        additional_special_tokens = tokenizer.additional_special_tokens
+        additional_special_tokens.append("&")
+        tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+        before_tokens = tokenizer.decode(sample_ids)
+        before_vocab = tokenizer.get_vocab()
+        tokenizer.save_pretrained(tmpdirname)
+
+        after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
+        after_tokens = after_tokenizer.decode(sample_ids)
+        after_vocab = after_tokenizer.get_vocab()
+
+        self.assertEqual(before_tokens, after_tokens)
+        self.assertDictEqual(before_vocab, after_vocab)
+
+        self.assertEqual(len(tokenizer), before_len + 3)
+        self.assertEqual(len(tokenizer), len(after_tokenizer))
+        shutil.rmtree(tmpdirname)
+
+    def test_tokenizer_slow_store_full_signature(self):
+        signature = inspect.signature(self.tokenizer_class.__init__)
+        tokenizer = self.get_tokenizer()
+
+        for parameter_name, parameter in signature.parameters.items():
+            if parameter.default != inspect.Parameter.empty:
+                self.assertIn(parameter_name, tokenizer.init_kwargs)
+
+    def
test_zero_mean_unit_variance_normalization(self): + tokenizer = self.get_tokenizer(do_normalize=True) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + processed = tokenizer(speech_inputs, padding="longest") + input_values = processed.input_values + + def _check_zero_mean_unit_variance(input_vector): + self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3) + self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3) + + _check_zero_mean_unit_variance(input_values[0, :800]) + _check_zero_mean_unit_variance(input_values[1, :1000]) + _check_zero_mean_unit_variance(input_values[2]) + + def test_return_attention_mask(self): + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + + # default case -> no attention_mask is returned + tokenizer = self.get_tokenizer() + processed = tokenizer(speech_inputs) + self.assertNotIn("attention_mask", processed) + + # wav2vec2-lv60 -> return attention_mask + tokenizer = self.get_tokenizer(return_attention_mask=True) + processed = tokenizer(speech_inputs, padding="longest") + + self.assertIn("attention_mask", processed) + self.assertListEqual(list(processed.attention_mask.shape), list(processed.input_values.shape)) + self.assertListEqual(processed.attention_mask.sum(-1).tolist(), [800, 1000, 1200]) + + @slow + @require_torch + def test_pretrained_checkpoints_are_set_correctly(self): + # this test makes sure that models that are using + # group norm don't have their tokenizer return the + # attention_mask + for model_id in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST: + config = Wav2Vec2Config.from_pretrained(model_id) + tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_id) + + # only "layer" feature extraction norm should make use of + # attention_mask + self.assertEqual(tokenizer.return_attention_mask, config.feat_extract_norm == "layer") + + +class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = Wav2Vec2CTCTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} + + self.tmpdirname = tempfile.mkdtemp() + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def test_tokenizer_add_token_chars(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + # check adding a single token + tokenizer.add_tokens("x") + token_ids = tokenizer("C x A").input_ids + self.assertEqual(token_ids, [19, 4, 32, 4, 7]) + + tokenizer.add_tokens(["a", "b", "c"]) + token_ids = tokenizer("C a A c").input_ids + self.assertEqual(token_ids, [19, 4, 33, 4, 7, 4, 35]) + + tokenizer.add_tokens(["a", "b", "c"]) + token_ids = tokenizer("CaA c").input_ids + self.assertEqual(token_ids, [19, 33, 7, 4, 35]) + + def test_tokenizer_add_token_words(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + # check adding a single token + tokenizer.add_tokens("xxx") + token_ids = tokenizer("C xxx A B").input_ids + self.assertEqual(token_ids, [19, 4, 32, 4, 7, 4, 24]) + + tokenizer.add_tokens(["aaa", "bbb", "ccc"]) + 
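# "xxx" received id 32 above, so "aaa", "bbb" and "ccc" get ids 33, 34 and 35.
+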
token_ids = tokenizer("C aaa A ccc B B").input_ids + self.assertEqual(token_ids, [19, 4, 33, 4, 7, 4, 35, 4, 24, 4, 24]) + + tokenizer.add_tokens(["aaa", "bbb", "ccc"]) + token_ids = tokenizer("CaaaA ccc B B").input_ids + self.assertEqual(token_ids, [19, 33, 7, 4, 35, 4, 24, 4, 24]) + + def test_tokenizer_decode(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + tokens = tokenizer.decode(sample_ids[0]) + batch_tokens = tokenizer.batch_decode(sample_ids) + self.assertEqual(tokens, batch_tokens[0]) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_special(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + sample_ids_2 = [ + [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98], + [ + 24, + 22, + 5, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.word_delimiter_token_id, + 24, + 22, + 5, + 77, + tokenizer.word_delimiter_token_id, + ], + ] + + batch_tokens = tokenizer.batch_decode(sample_ids) + batch_tokens_2 = tokenizer.batch_decode(sample_ids_2) + self.assertEqual(batch_tokens, batch_tokens_2) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_added_tokens(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + tokenizer.add_tokens(["!", "?"]) + tokenizer.add_special_tokens({"cls_token": "$$$"}) + + sample_ids = [ + [ + 11, + 5, + 15, + tokenizer.pad_token_id, + 15, + 8, + 98, + 32, + 32, + 33, + tokenizer.word_delimiter_token_id, + 32, + 32, + 33, + 34, + 34, + ], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], + ] + batch_tokens = tokenizer.batch_decode(sample_ids) + + self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + + def test_special_characters_in_vocab(self): + sent = "ʈʰ æ æ̃ ˧ kʰ" + + vocab_dict = {k: v for v, k in enumerate({phoneme for phoneme in sent.split()})} + vocab_file = os.path.join(self.tmpdirname, "vocab_special.json") + + with open(vocab_file, "w") as f: + json.dump(vocab_dict, f) + + tokenizer = Wav2Vec2CTCTokenizer(vocab_file) + + expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True) + self.assertEqual(sent, expected_sent) + + tokenizer.save_pretrained(os.path.join(self.tmpdirname, "special_tokenizer")) + tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(os.path.join(self.tmpdirname, "special_tokenizer")) + + expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True) + self.assertEqual(sent, expected_sent) + + def test_pretrained_model_lists(self): + # Wav2Vec2Model has no max model length => no testing + pass + + # overwrite from test_tokenization_common + def test_add_tokens_tokenizer(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) 
+ + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-3], tokens[-4]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-3], tokenizer.pad_token_id) diff --git a/test_tokenization_xlm.py b/test_tokenization_xlm.py new file mode 100644 index 0000000000000000000000000000000000000000..cf0296ddd9b059bdcc3d1ce69f96ee580404a2f2 --- /dev/null +++ b/test_tokenization_xlm.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer +from transformers.testing_utils import slow + +from .test_tokenization_common import TokenizerTesterMixin + + +class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = XLMTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "w</w>",
+            "r</w>",
+            "t</w>",
+            "lo",
+            "low",
+            "er</w>",
+            "low</w>",
+            "lowest</w>",
+            "newer</w>",
+            "wider</w>",
+            "<unk>",
+        ]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "lower newer"
+        output_text = "lower newer"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
+        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
+
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        assert encoded_sentence == [0] + text + [1]
+        assert encoded_pair == [0] + text + [1] + text_2 + [1]
diff --git a/test_tokenization_xlm_prophetnet.py b/test_tokenization_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5620477a2cc7f780e3d4ad73a9ed08b31ffcd708
--- /dev/null
+++ b/test_tokenization_xlm_prophetnet.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
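The XLM tests above rely on writing a tiny BPE vocabulary and merges file to disk. The same fixture pattern can be reproduced standalone; a minimal sketch, assuming `transformers` plus `sacremoses` (which the slow XLM tokenizer needs) are installed:

```python
import json
import os
import tempfile

from transformers.models.xlm.tokenization_xlm import XLMTokenizer

# Tiny fixture: just enough merges to turn "lower" into ["low", "er</w>"].
vocab = {"l": 0, "o": 1, "w": 2, "e": 3, "r": 4, "lo": 5, "low": 6, "er</w>": 7, "<unk>": 8}
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

tmpdir = tempfile.mkdtemp()
vocab_file = os.path.join(tmpdir, "vocab.json")
merges_file = os.path.join(tmpdir, "merges.txt")
with open(vocab_file, "w") as fp:
    json.dump(vocab, fp)
with open(merges_file, "w") as fp:
    fp.write("\n".join(merges))

tokenizer = XLMTokenizer(vocab_file, merges_file)
print(tokenizer.tokenize("lower"))  # ["low", "er</w>"]
```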
+ +import os +import unittest + +from transformers.file_utils import cached_property +from transformers.models.xlm_prophetnet.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer +from transformers.testing_utils import require_sentencepiece, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = XLMProphetNetTokenizer + test_rust_tokenizer = False + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "[PAD]" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "[PAD]") + self.assertEqual(vocab_keys[1], "[CLS]") + self.assertEqual(vocab_keys[-1], "j") + self.assertEqual(len(vocab_keys), 1_012) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_012) + + def test_full_tokenizer(self): + tokenizer = XLMProphetNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, -9, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, -9, 4] + ], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "[UNK]", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "[UNK]", + ".", + ], + ) + + @cached_property + def big_tokenizer(self): + return XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased") + + @slow + def test_tokenization_base_easy_symbols(self): + symbols = "Hello World!" 
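+        # Reference ids produced by the pretrained microsoft/xprophetnet-large-wiki100-cased tokenizer (see big_tokenizer above).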
+ original_tokenizer_encodings = [35389, 6672, 49, 2] + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[11073, 82783, 18, 26, 82783, 549, 51540, 248, 17209, 1301, 217, 20, 215186, 1325, 147, 17209, 1301, 217, 20, 56370, 53, 122020, 20, 16477, 27, 87355, 4548, 20, 4728, 78392, 17, 159969, 18, 26, 24491, 629, 15, 538, 22704, 5439, 15, 2788, 24491, 9885, 15, 43534, 605, 15, 814, 18403, 33200, 29, 15, 43534, 24458, 12410, 111, 24966, 83669, 9637, 144068, 26, 850, 22346, 27, 147, 24966, 83669, 83490, 26, 39113, 735, 27, 689, 656, 2800, 1339, 4600, 53, 122020, 115785, 34, 816, 1339, 46887, 18, 147, 53905, 1951, 42238, 41170, 17732, 834, 436, 15, 27523, 98733, 217, 147, 5542, 4981, 930, 17347, 16, 2], [20091, 629, 94, 82786, 58, 490, 20, 1528, 84, 53905, 344, 80592, 110128, 18822, 5267, 1306, 62, 152537, 308, 7997, 401, 124427, 549, 35442, 225, 109, 15055, 25748, 147, 7119, 43712, 34, 767, 135366, 18, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [592, 63784, 119466, 17, 147808, 88214, 18, 656, 81, 32, 3296, 10280, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="microsoft/xprophetnet-large-wiki100-cased", + revision="1acad1643ddd54a44df6a1b797ada8373685d90e", + ) diff --git a/test_tokenization_xlm_roberta.py b/test_tokenization_xlm_roberta.py new file mode 100644 index 0000000000000000000000000000000000000000..3604395e6fb434096b1acb2a8304dbfe181ea20f --- /dev/null +++ b/test_tokenization_xlm_roberta.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = XLMRobertaTokenizer + rust_tokenizer_class = XLMRobertaTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-1], "") + self.assertEqual(len(vocab_keys), 1_002) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_002) + + def test_full_tokenizer(self): + tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] + # ^ unk: 2 + 1 = 3 unk: 2 + 1 = 3 ^ + ], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + @cached_property + def big_tokenizer(self): + return 
XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé." + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_tokenization_base_easy_symbols(self): + symbols = "Hello World!" + original_tokenizer_encodings = [0, 35378, 6661, 38, 2] + # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base') # xlmr.large has same tokenizer + # xlmr.eval() + # xlmr.encode(symbols) + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenization_base_hard_symbols(self): + symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to , such as saoneuhaoesuth' + original_tokenizer_encodings = [ + 0, + 3293, + 83, + 10, + 4552, + 4989, + 7986, + 678, + 10, + 5915, + 111, + 179459, + 124850, + 4, + 6044, + 237, + 12, + 6, + 5, + 6, + 4, + 6780, + 705, + 15, + 1388, + 44, + 378, + 10114, + 711, + 152, + 20, + 6, + 5, + 22376, + 642, + 1221, + 15190, + 34153, + 450, + 5608, + 959, + 1119, + 57702, + 136, + 186, + 47, + 1098, + 29367, + 47, + # 4426, # What fairseq tokenizes from "": "_<" + # 3678, # What fairseq tokenizes from "": "unk" + # 2740, # What fairseq tokenizes from "": ">" + 3, # What we tokenize from "": "" + 6, # Residue from the tokenization: an extra sentencepiece underline + 4, + 6044, + 237, + 6284, + 50901, + 528, + 31, + 90, + 34, + 927, + 2, + ] + # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base') # xlmr.large has same tokenizer + # xlmr.eval() + # xlmr.encode(symbols) + + self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols)) + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[0, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 5428, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 12399, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 2789, 1328, 4589, 42, 122009, 115774, 23, 805, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [0, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="xlm-roberta-base", + revision="d9d8a8ea5eb94b1c6654ae9249df7793cd2933d3", + ) diff --git a/test_tokenization_xlnet.py b/test_tokenization_xlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..292958eec124d0f62b9011bfb683131fe5d39311 --- /dev/null +++ b/test_tokenization_xlnet.py @@ -0,0 +1,213 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
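The fairseq-offset bookkeeping asserted throughout the XLM-R tests above can be observed directly on the pretrained tokenizer: for every piece the SentencePiece model knows, the final id is the raw piece id shifted by `fairseq_offset`, so that the low ids stay reserved for `<s>`, `<pad>`, `</s>` and `<unk>`. A sketch, assuming the `xlm-roberta-base` checkpoint is downloadable:

```python
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

pieces = tokenizer.tokenize("Hello World!")
ids = tokenizer.convert_tokens_to_ids(pieces)

# Raw SentencePiece ids, before the fairseq shift is applied.
sp_ids = [tokenizer.sp_model.PieceToId(p) for p in pieces]
assert ids == [i + tokenizer.fairseq_offset for i in sp_ids]
```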
+ +import os +import unittest + +from transformers import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = XLNetTokenizer + rust_tokenizer_class = XLNetTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.sanitize_special_tokens() + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-1], "") + self.assertEqual(len(vocab_keys), 1_006) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_000) + + def test_full_tokenizer(self): + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + def test_tokenizer_lower(self): + tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "", + "i", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"]) + + def test_tokenizer_no_lower(self): + tokenizer = 
XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) + + @slow + def test_sequence_builders(self): + tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == text + [4, 3] + assert encoded_pair == text + [4] + text_2 + [4, 3] + + @slow + def test_tokenizer_integration(self): + # fmt: off + expected_encoding = {'input_ids': [[17, 21442, 270, 17, 10, 14645, 318, 34, 17, 4546, 3145, 787, 13, 7752, 22018, 23, 21, 17, 4546, 3145, 787, 13, 3352, 14431, 13, 5500, 11, 1176, 580, 13, 16819, 4797, 23, 17, 10, 17135, 658, 19, 457, 7932, 13, 184, 19, 3154, 17135, 6468, 19, 1404, 12269, 19, 4229, 5356, 16264, 46, 19, 17, 20545, 10395, 9, 9, 9, 11, 28, 6421, 9531, 20729, 17, 10, 353, 17022, 11, 21, 6421, 9531, 16949, 17, 10, 11509, 753, 11, 33, 95, 2421, 7385, 956, 14431, 2626, 25, 842, 7385, 4836, 21, 1429, 2272, 9855, 3120, 161, 24738, 19, 13203, 658, 218, 787, 21, 430, 18482, 847, 2637, 9, 4, 3], [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 322, 22178, 27, 1064, 22, 956, 13, 11101, 1429, 5854, 24313, 18953, 40, 422, 24366, 68, 1758, 37, 10483, 14257, 31, 207, 263, 21, 203, 3773, 25, 71, 9735, 9, 4, 3], [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 32, 2049, 3442, 17, 13894, 3380, 23, 95, 18, 17634, 2288, 9, 4, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="xlnet-base-cased", + revision="c841166438c31ec7ca9a106dee7bb312b73ae511", + ) diff --git a/test_trainer.py b/test_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..fbabf48bc0aa6504316016014269c69f8f9677ef --- /dev/null +++ b/test_trainer.py @@ -0,0 +1,1362 @@ +# coding=utf-8 +# Copyright 2018 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
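Unlike BERT-style models, XLNet appends its special tokens: a single sequence is encoded as `X <sep> <cls>` and a pair as `A <sep> B <sep> <cls>`, which is exactly what `test_sequence_builders` above asserts with the ids 4 (`<sep>`) and 3 (`<cls>`). A short sketch, assuming the `xlnet-base-cased` checkpoint:

```python
from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

ids = tokenizer.encode("sequence builders", add_special_tokens=False)
with_special = tokenizer.build_inputs_with_special_tokens(ids)

# Special tokens go at the end: [..., <sep>, <cls>] == [..., 4, 3]
assert with_special == ids + [tokenizer.sep_token_id, tokenizer.cls_token_id]
```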
+ +import dataclasses +import gc +import math +import os +import random +import re +import tempfile +import unittest +from pathlib import Path + +import numpy as np + +from huggingface_hub import HfApi +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, IntervalStrategy, PretrainedConfig, TrainingArguments, is_torch_available +from transformers.file_utils import WEIGHTS_NAME +from transformers.testing_utils import ( + ENDPOINT_STAGING, + PASS, + USER, + TestCasePlus, + get_gpu_count, + get_tests_dir, + is_staging_test, + require_datasets, + require_optuna, + require_ray, + require_sentencepiece, + require_tokenizers, + require_torch, + require_torch_gpu, + require_torch_multi_gpu, + slow, +) +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR +from transformers.utils.hp_naming import TrialShortNamer + + +if is_torch_available(): + import torch + from torch import nn + from torch.utils.data import IterableDataset + + from transformers import ( + AutoModelForSequenceClassification, + EarlyStoppingCallback, + GlueDataset, + GlueDataTrainingArguments, + GPT2Config, + GPT2LMHeadModel, + LineByLineTextDataset, + PreTrainedModel, + Trainer, + TrainerState, + ) + from transformers.modeling_utils import unwrap_model + + +PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt" + + +class RegressionDataset: + def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): + np.random.seed(seed) + self.label_names = ["labels"] if label_names is None else label_names + self.length = length + self.x = np.random.normal(size=(length,)).astype(np.float32) + self.ys = [a * self.x + b + np.random.normal(scale=0.1, size=(length,)) for _ in self.label_names] + self.ys = [y.astype(np.float32) for y in self.ys] + + def __len__(self): + return self.length + + def __getitem__(self, i): + result = {name: y[i] for name, y in zip(self.label_names, self.ys)} + result["input_x"] = self.x[i] + return result + + +@dataclasses.dataclass +class RegressionTrainingArguments(TrainingArguments): + a: float = 0.0 + b: float = 0.0 + + +class RepeatDataset: + def __init__(self, x, length=64): + self.x = x + self.length = length + + def __len__(self): + return self.length + + def __getitem__(self, i): + return {"input_ids": self.x, "labels": self.x} + + +class DynamicShapesDataset: + def __init__(self, length=64, seed=42, batch_size=8): + self.length = length + np.random.seed(seed) + sizes = np.random.randint(1, 20, (length // batch_size,)) + # For easy batching, we make every batch_size consecutive samples the same size. 
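+        # np.ndarray.repeat duplicates each sampled length batch_size times in a row,
+        # so samples i*batch_size .. (i+1)*batch_size - 1 all share one shape.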
+ self.xs = [np.random.normal(size=(s,)) for s in sizes.repeat(batch_size)] + self.ys = [np.random.normal(size=(s,)) for s in sizes.repeat(batch_size)] + + def __len__(self): + return self.length + + def __getitem__(self, i): + return {"input_x": self.xs[i], "labels": self.ys[i]} + + +class AlmostAccuracy: + def __init__(self, thresh=0.25): + self.thresh = thresh + + def __call__(self, eval_pred): + predictions, labels = eval_pred + true = np.abs(predictions - labels) <= self.thresh + return {"accuracy": true.astype(np.float32).mean().item()} + + +class RegressionModelConfig(PretrainedConfig): + def __init__(self, a=0, b=0, double_output=False, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + self.double_output = double_output + self.hidden_size = 1 + + +if is_torch_available(): + + class SampleIterableDataset(IterableDataset): + def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): + self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names) + + def __iter__(self): + for i in range(len(self.dataset)): + yield self.dataset[i] + + class RegressionModel(nn.Module): + def __init__(self, a=0, b=0, double_output=False): + super().__init__() + self.a = nn.Parameter(torch.tensor(a).float()) + self.b = nn.Parameter(torch.tensor(b).float()) + self.double_output = double_output + self.config = None + + def forward(self, input_x, labels=None, **kwargs): + y = input_x * self.a + self.b + if labels is None: + return (y, y) if self.double_output else (y,) + loss = nn.functional.mse_loss(y, labels) + return (loss, y, y) if self.double_output else (loss, y) + + class RegressionDictModel(nn.Module): + def __init__(self, a=0, b=0): + super().__init__() + self.a = nn.Parameter(torch.tensor(a).float()) + self.b = nn.Parameter(torch.tensor(b).float()) + self.config = None + + def forward(self, input_x, labels=None, **kwargs): + y = input_x * self.a + self.b + result = {"output": y} + if labels is not None: + result["loss"] = nn.functional.mse_loss(y, labels) + return result + + class RegressionPreTrainedModel(PreTrainedModel): + config_class = RegressionModelConfig + base_model_prefix = "regression" + + def __init__(self, config): + super().__init__(config) + self.a = nn.Parameter(torch.tensor(config.a).float()) + self.b = nn.Parameter(torch.tensor(config.b).float()) + self.double_output = config.double_output + + def forward(self, input_x, labels=None, **kwargs): + y = input_x * self.a + self.b + if labels is None: + return (y, y) if self.double_output else (y,) + loss = nn.functional.mse_loss(y, labels) + return (loss, y, y) if self.double_output else (loss, y) + + class RegressionRandomPreTrainedModel(PreTrainedModel): + config_class = RegressionModelConfig + base_model_prefix = "regression" + + def __init__(self, config): + super().__init__(config) + self.a = nn.Parameter(torch.tensor(config.a).float()) + self.b = nn.Parameter(torch.tensor(config.b).float()) + + def forward(self, input_x, labels=None, **kwargs): + y = input_x * self.a + self.b + torch_rand = torch.randn(1).squeeze() + np_rand = np.random.rand() + rand_rand = random.random() + + y += 0.05 * torch_rand + 0.05 * torch.tensor(np_rand + rand_rand) + + if labels is None: + return (y,) + loss = nn.functional.mse_loss(y, labels) + return (loss, y) + + class TstLayer(nn.Module): + def __init__(self, hidden_size): + super().__init__() + self.linear1 = nn.Linear(hidden_size, hidden_size) + self.ln1 = nn.LayerNorm(hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + 
self.ln2 = nn.LayerNorm(hidden_size) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + + def forward(self, x): + h = self.ln1(nn.functional.relu(self.linear1(x))) + h = nn.functional.relu(self.linear2(h)) + return self.ln2(x + h + self.bias) + + def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, **kwargs): + label_names = kwargs.get("label_names", None) + train_dataset = RegressionDataset(length=train_len, label_names=label_names) + eval_dataset = RegressionDataset(length=eval_len, label_names=label_names) + + model_init = kwargs.pop("model_init", None) + if model_init is not None: + model = None + else: + if pretrained: + config = RegressionModelConfig(a=a, b=b, double_output=double_output) + model = RegressionPreTrainedModel(config) + else: + model = RegressionModel(a=a, b=b, double_output=double_output) + + compute_metrics = kwargs.pop("compute_metrics", None) + data_collator = kwargs.pop("data_collator", None) + optimizers = kwargs.pop("optimizers", (None, None)) + output_dir = kwargs.pop("output_dir", "./regression") + + args = RegressionTrainingArguments(output_dir, a=a, b=b, **kwargs) + return Trainer( + model, + args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + optimizers=optimizers, + model_init=model_init, + ) + + +class TrainerIntegrationCommon: + def check_saved_checkpoints(self, output_dir, freq, total, is_pretrained=True): + file_list = [WEIGHTS_NAME, "training_args.bin", "optimizer.pt", "scheduler.pt", "trainer_state.json"] + if is_pretrained: + file_list.append("config.json") + for step in range(freq, total, freq): + checkpoint = os.path.join(output_dir, f"checkpoint-{step}") + self.assertTrue(os.path.isdir(checkpoint)) + for filename in file_list: + self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename))) + + def check_best_model_has_been_loaded( + self, output_dir, freq, total, trainer, metric, greater_is_better=False, is_pretrained=True + ): + checkpoint = os.path.join(output_dir, f"checkpoint-{(total // freq) * freq}") + log_history = TrainerState.load_from_json(os.path.join(checkpoint, "trainer_state.json")).log_history + + values = [d[metric] for d in log_history] + best_value = max(values) if greater_is_better else min(values) + best_checkpoint = (values.index(best_value) + 1) * freq + checkpoint = os.path.join(output_dir, f"checkpoint-{best_checkpoint}") + if is_pretrained: + best_model = RegressionPreTrainedModel.from_pretrained(checkpoint) + best_model.to(trainer.args.device) + else: + best_model = RegressionModel() + state_dict = torch.load(os.path.join(checkpoint, WEIGHTS_NAME)) + best_model.load_state_dict(state_dict) + best_model.to(trainer.args.device) + self.assertTrue(torch.allclose(best_model.a, trainer.model.a)) + self.assertTrue(torch.allclose(best_model.b, trainer.model.b)) + + metrics = trainer.evaluate() + self.assertEqual(metrics[metric], best_value) + + def check_trainer_state_are_the_same(self, trainer_state, trainer_state1): + # We'll pop things, so operate on copies. + state = trainer_state.copy() + state1 = trainer_state1.copy() + # Log history may contain different logs for the time metrics (after resuming a training).
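Since check_best_model_has_been_loaded above reads trainer_state.json through TrainerState, it is worth noting the file is plain JSON and can be inspected directly as well; a minimal sketch, with a hypothetical checkpoint path:

import json

with open("./regression/checkpoint-5/trainer_state.json") as f:
    state = json.load(f)
# log_history is a list of dicts, one entry per logging/evaluation event.
eval_losses = [log["eval_loss"] for log in state["log_history"] if "eval_loss" in log]
print(min(eval_losses))  # the value the best-model logic would pick for eval_loss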
+ log_history = state.pop("log_history", None) + log_history1 = state1.pop("log_history", None) + self.assertEqual(state, state1) + skip_log_keys = ["train_runtime", "train_samples_per_second", "train_steps_per_second", "train_loss"] + for log, log1 in zip(log_history, log_history1): + for key in skip_log_keys: + _ = log.pop(key, None) + _ = log1.pop(key, None) + self.assertEqual(log, log1) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): + def setUp(self): + super().setUp() + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + trainer = get_regression_trainer(learning_rate=0.1) + trainer.train() + self.default_trained_model = (trainer.model.a, trainer.model.b) + + trainer = get_regression_trainer(learning_rate=0.1, seed=314) + trainer.train() + self.alternate_trained_model = (trainer.model.a, trainer.model.b) + + def check_trained_model(self, model, alternate_seed=False): + # Checks a training seeded with learning_rate = 0.1 + (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model + self.assertTrue(torch.allclose(model.a, a)) + self.assertTrue(torch.allclose(model.b, b)) + + def test_trainer_works_with_dict(self): + # Edge case because Apex with mode O2 will change our models to return dicts. This test checks it doesn't break + # anything. + train_dataset = RegressionDataset() + eval_dataset = RegressionDataset() + model = RegressionDictModel() + args = TrainingArguments("./regression") + trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train() + _ = trainer.evaluate() + _ = trainer.predict(eval_dataset) + + def test_evaluation_with_keys_to_drop(self): + config = GPT2Config(vocab_size=100, n_positions=128, n_ctx=128, n_embd=32, n_layer=3, n_head=4) + tiny_gpt2 = GPT2LMHeadModel(config) + x = torch.randint(0, 100, (128,)) + eval_dataset = RepeatDataset(x) + args = TrainingArguments("./test") + trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset) + # By default the past_key_values are removed + result = trainer.predict(eval_dataset) + self.assertTrue(isinstance(result.predictions, np.ndarray)) + # We can still get them by setting ignore_keys to [] + result = trainer.predict(eval_dataset, ignore_keys=[]) + self.assertTrue(isinstance(result.predictions, tuple)) + self.assertEqual(len(result.predictions), 2) + + def test_training_arguments_are_left_untouched(self): + trainer = get_regression_trainer() + trainer.train() + args = TrainingArguments("./regression") + dict1, dict2 = args.to_dict(), trainer.args.to_dict() + for key in dict1.keys(): + # Logging dir can be slightly different as they default to something with the time. + if key != "logging_dir": + self.assertEqual(dict1[key], dict2[key]) + + def test_reproducible_training(self): + # Checks that training worked, model trained and seed made a reproducible training. + trainer = get_regression_trainer(learning_rate=0.1) + trainer.train() + self.check_trained_model(trainer.model) + + # Checks that a different seed gets different (reproducible) results. 
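The reproducibility contract behind these checks: a fixed seed makes the whole run deterministic, so two trainings from scratch land on identical parameters, while a different seed lands elsewhere just as deterministically. The same idea in a self-contained torch sketch:

import torch

def train_once(seed):
    torch.manual_seed(seed)
    w = torch.nn.Parameter(torch.randn(1))
    opt = torch.optim.SGD([w], lr=0.1)
    for _ in range(10):
        loss = ((w - 3.0) ** 2).sum()
        opt.zero_grad()
        loss.backward()
        opt.step()
    return w.item()

assert train_once(42) == train_once(42)   # same seed, identical result
assert train_once(42) != train_once(314)  # different seed, different (but repeatable) result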
+ trainer = get_regression_trainer(learning_rate=0.1, seed=314) + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) + + def test_number_of_steps_in_training(self): + # Regular training has n_epochs * len(train_dl) steps + trainer = get_regression_trainer(learning_rate=0.1) + train_output = trainer.train() + self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) + + # Check passing num_train_epochs works (and a float version too): + trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) + + # If we pass a max_steps, num_train_epochs is ignored + trainer = get_regression_trainer(learning_rate=0.1, max_steps=10) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 10) + + def test_train_and_eval_dataloaders(self): + n_gpu = max(1, torch.cuda.device_count()) + trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16) + self.assertEqual(trainer.get_train_dataloader().batch_size, 16 * n_gpu) + trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16) + self.assertEqual(trainer.get_eval_dataloader().batch_size, 16 * n_gpu) + + # Check drop_last works + trainer = get_regression_trainer( + train_len=66, eval_len=74, learning_rate=0.1, per_device_train_batch_size=16, per_device_eval_batch_size=32 + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1) + + trainer = get_regression_trainer( + train_len=66, + eval_len=74, + learning_rate=0.1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + dataloader_drop_last=True, + ) + self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu)) + self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu)) + + # Check passing a new dataset for evaluation works + new_eval_dataset = RegressionDataset(length=128) + self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu)) + + @require_torch_multi_gpu + def test_data_is_not_parallelized_when_model_is_parallel(self): + model = RegressionModel() + # Make the Trainer believe it's a parallelized model + model.is_parallelizable = True + model.model_parallel = True + args = TrainingArguments("./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16) + trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset()) + # Check the Trainer was fooled + self.assertTrue(trainer.is_model_parallel) + self.assertEqual(trainer.args.n_gpu, 1) + + # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu + self.assertEqual(trainer.get_train_dataloader().batch_size, 16) + self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16) + self.assertEqual(trainer.get_eval_dataloader().batch_size, 16) + self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16) + + def test_evaluate(self): + trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy()) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # 
With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy()) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + def test_predict(self): + trainer = get_regression_trainer(a=1.5, b=2.5) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With a number of elements not a round multiple of the batch size + trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With more than one output of the model + trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True) + preds = trainer.predict(trainer.eval_dataset).predictions + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + + # With more than one output/label of the model + trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"]) + outputs = trainer.predict(trainer.eval_dataset) + preds = outputs.predictions + labels = outputs.label_ids + x = trainer.eval_dataset.x + self.assertEqual(len(preds), 2) + self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5)) + self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5)) + self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0])) + self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1])) + + def test_dynamic_shapes(self): + eval_dataset = DynamicShapesDataset(batch_size=self.batch_size) + model = RegressionModel(a=2, b=1) + args = TrainingArguments("./regression") + trainer = Trainer(model, args, eval_dataset=eval_dataset) + + # Check evaluation can run to completion + _ = trainer.evaluate() + + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + + # Same tests with eval accumulation + args = TrainingArguments("./regression", eval_accumulation_steps=2) + trainer = Trainer(model, args, eval_dataset=eval_dataset) + + # Check evaluation can run to completion + _ = trainer.evaluate() + + # Check predictions + preds = trainer.predict(eval_dataset) + for expected, seen in zip(eval_dataset.ys, preds.label_ids): + self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + + for expected, seen in zip(eval_dataset.xs, preds.predictions): + self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]])) + self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) + + @require_datasets + def test_trainer_with_datasets(self): + import datasets + + np.random.seed(42) + x = 
np.random.normal(size=(64,)).astype(np.float32) + y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)) + train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y}) + + # Base training. Should have the same results as test_reproducible_training + model = RegressionModel() + args = TrainingArguments("./regression", learning_rate=0.1) + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) + + # Can return tensors. + train_dataset.set_format(type="torch", dtype=torch.float32) + model = RegressionModel() + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) + + # Adding one column not used by the model should have no impact + z = np.random.normal(size=(64,)).astype(np.float32) + train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z}) + model = RegressionModel() + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.train() + self.check_trained_model(trainer.model) + + def test_custom_optimizer(self): + train_dataset = RegressionDataset() + args = TrainingArguments("./regression") + model = RegressionModel() + optimizer = torch.optim.SGD(model.parameters(), lr=1.0) + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) + trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) + trainer.train() + + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) + + @require_torch + def test_adafactor_lr_none(self): + # test the special case where lr=None, since the Trainer can't run without an lr_scheduler + + from transformers.optimization import Adafactor, AdafactorSchedule + + train_dataset = RegressionDataset() + args = TrainingArguments("./regression") + model = RegressionModel() + optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) + lr_scheduler = AdafactorSchedule(optimizer) + trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) + trainer.train() + + (a, b) = self.default_trained_model + self.assertFalse(torch.allclose(trainer.model.a, a)) + self.assertFalse(torch.allclose(trainer.model.b, b)) + self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0) + + def test_model_init(self): + train_dataset = RegressionDataset() + args = TrainingArguments("./regression", learning_rate=0.1) + trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel()) + trainer.train() + self.check_trained_model(trainer.model) + + # Re-training should restart from scratch, thus lead to the same results. + trainer.train() + self.check_trained_model(trainer.model) + + # Re-training should restart from scratch, thus lead to the same results, and the new seed should be used.
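The point of model_init, exercised here and again in the hyperparameter-search tests further down, is that the Trainer re-seeds and calls it at the start of every train() run, so each run (or search trial) begins from freshly initialized weights rather than the previous run's. A sketch of the wiring, reusing the fixtures defined earlier in this file:

def model_init():
    return RegressionModel()  # fresh weights on every call

args = TrainingArguments("./regression", learning_rate=0.1, seed=42)
trainer = Trainer(args=args, train_dataset=RegressionDataset(), model_init=model_init)
trainer.train()  # builds the model via model_init, then trains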
+ trainer.args.seed = 314 + trainer.train() + self.check_trained_model(trainer.model, alternate_seed=True) + + def test_save_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5) + trainer.train() + self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size)) + + # With a regular model that is not a PreTrainedModel + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False) + trainer.train() + self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False) + + def test_gradient_accumulation(self): + # Training with half the batch size but accumulation steps as 2 should give the same results. + trainer = get_regression_trainer( + gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1 + ) + trainer.train() + self.check_trained_model(trainer.model) + + @require_torch_multi_gpu + def test_run_seq2seq_double_train_wrap_once(self): + # test that we don't wrap the model more than once + # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for + # example DataParallel(DataParallel(model)) + + trainer = get_regression_trainer() + trainer.train() + model_wrapped_before = trainer.model_wrapped + trainer.train() + model_wrapped_after = trainer.model_wrapped + self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice") + + def test_can_resume_training(self): + if torch.cuda.device_count() > 2: + # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of + # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model + # won't be the same since the training dataloader is shuffled). 
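The guard above exists because the effective batch size scales with the number of visible GPUs, which shifts where the checkpoint steps fall relative to epochs. The relation, as the tests in this file use it (a sketch of the arithmetic, not Trainer internals):

def effective_batch_size(per_device, n_gpu, grad_accum=1):
    # Samples consumed per optimizer update under DataParallel-style scaling.
    return per_device * max(1, n_gpu) * grad_accum

assert effective_batch_size(4, 1, grad_accum=2) == 8  # test_gradient_accumulation's setup
assert effective_batch_size(16, 2) == 32              # why extra GPUs change the step counts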
+ return + + with tempfile.TemporaryDirectory() as tmpdir: + kwargs = dict(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1) + trainer = get_regression_trainer(**kwargs) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(tmpdir, "checkpoint-5") + + # Reinitialize trainer + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # Now check with a later checkpoint that it also works when we span over one epoch + checkpoint = os.path.join(tmpdir, "checkpoint-15") + + # Reinitialize trainer and load model + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # With a regular model that is not a PreTrainedModel + with tempfile.TemporaryDirectory() as tmpdir: + kwargs = dict(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, pretrained=False) + + trainer = get_regression_trainer(**kwargs) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(tmpdir, "checkpoint-5") + + # Reinitialize trainer and load model + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # Now check with a later checkpoint that it also works when we span over one epoch + checkpoint = os.path.join(tmpdir, "checkpoint-15") + + # Reinitialize trainer and load model + trainer = get_regression_trainer(**kwargs) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + # Now check failures + + # 1. fail to find a bogus checkpoint + trainer = get_regression_trainer() + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") + self.assertTrue("Can't find a valid checkpoint at" in str(context.exception)) + + # 2. fail to find any checkpoint - due a fresh output_dir + output_dir2 = self.get_auto_remove_tmp_dir() + trainer = get_regression_trainer(output_dir=output_dir2) + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=True) + self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) + + def test_resume_training_with_randomness(self): + if torch.cuda.device_count() >= 2: + # This test will fail flakily for more than 2 GPUs since the result will be slightly more different. 
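The randomness test that follows passes because, in outline, a checkpoint also stores the python, numpy and torch RNG states and restores them before training resumes. The underlying idea, as a standalone sketch:

import random
import numpy as np
import torch

snapshot = {
    "python": random.getstate(),
    "numpy": np.random.get_state(),
    "torch": torch.get_rng_state(),
}
first = (random.random(), np.random.rand(), torch.randn(1).item())

random.setstate(snapshot["python"])
np.random.set_state(snapshot["numpy"])
torch.set_rng_state(snapshot["torch"])
second = (random.random(), np.random.rand(), torch.randn(1).item())

assert first == second  # identical draws once the states are restored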
+ return + + if torch.cuda.is_available(): + torch.backends.cudnn.deterministic = True + train_dataset = RegressionDataset(length=128) + eval_dataset = RegressionDataset() + + config = RegressionModelConfig(a=0, b=2) + model = RegressionRandomPreTrainedModel(config) + + tmp_dir = self.get_auto_remove_tmp_dir() + args = RegressionTrainingArguments(tmp_dir, save_steps=5, learning_rate=0.1) + trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + + model = RegressionRandomPreTrainedModel(config) + trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) + trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15")) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + + self.assertTrue(math.isclose(a, a1, rel_tol=1e-8)) + self.assertTrue(math.isclose(b, b1, rel_tol=1e-8)) + + def test_resume_training_with_gradient_accumulation(self): + if torch.cuda.device_count() > 2: + # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of + # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model + # won't be the same since the training dataloader is shuffled). + return + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=128, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + save_steps=5, + learning_rate=0.1, + ) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(tmpdir, "checkpoint-5") + + # Reinitialize trainer + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=128, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + save_steps=5, + learning_rate=0.1, + ) + + trainer.train(resume_from_checkpoint=checkpoint) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + def test_resume_training_with_frozen_params(self): + if torch.cuda.device_count() > 2: + # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of + # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model + # won't be the same since the training dataloader is shuffled). 
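The frozen-parameter test below leans on a detail worth spelling out: requires_grad_(False) stops a tensor from receiving gradient updates but keeps it in the module's state_dict, so it still round-trips through checkpoints. A quick standalone check:

import torch
from torch import nn

model = nn.Linear(4, 2)
model.weight.requires_grad_(False)  # freeze the weight, keep training the bias
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
assert trainable == ["bias"]
assert "weight" in model.state_dict()  # frozen tensors are still saved and restored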
+ return + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=128, + per_device_train_batch_size=4, + save_steps=5, + learning_rate=0.1, + ) + trainer.model.a.requires_grad_(False) + trainer.train() + (a, b) = trainer.model.a.item(), trainer.model.b.item() + state = dataclasses.asdict(trainer.state) + + checkpoint = os.path.join(tmpdir, "checkpoint-5") + + # Reinitialize trainer + trainer = get_regression_trainer( + output_dir=tmpdir, + train_len=128, + per_device_train_batch_size=4, + save_steps=5, + learning_rate=0.1, + ) + trainer.model.a.requires_grad_(False) + + trainer.train(resume_from_checkpoint=checkpoint) + + self.assertFalse(trainer.model.a.requires_grad) + (a1, b1) = trainer.model.a.item(), trainer.model.b.item() + state1 = dataclasses.asdict(trainer.state) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + self.check_trainer_state_are_the_same(state, state1) + + def test_load_best_model_at_end(self): + total = int(self.n_epochs * 64 / self.batch_size) + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_steps=5, + evaluation_strategy="steps", + load_best_model_at_end=True, + ) + self.assertFalse(trainer.args.greater_is_better) + trainer.train() + self.check_saved_checkpoints(tmpdir, 5, total) + self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss") + + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_steps=5, + evaluation_strategy="steps", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.greater_is_better) + trainer.train() + self.check_saved_checkpoints(tmpdir, 5, total) + self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_accuracy", greater_is_better=True) + + # Save is done every eval regardless of the strategy + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + evaluation_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.greater_is_better) + trainer.train() + self.check_saved_checkpoints(tmpdir, 64 // self.batch_size, total) + self.check_best_model_has_been_loaded( + tmpdir, 64 // self.batch_size, total, trainer, "eval_accuracy", greater_is_better=True + ) + + # Test this works with a non PreTrainedModel + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + learning_rate=0.1, + eval_steps=5, + evaluation_strategy="steps", + load_best_model_at_end=True, + pretrained=False, + ) + self.assertFalse(trainer.args.greater_is_better) + trainer.train() + self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False) + self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False) + + @slow + def test_trainer_eval_mrpc(self): + MODEL_ID = "bert-base-cased-finetuned-mrpc" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID) + data_args = GlueDataTrainingArguments( + task_name="mrpc", data_dir=f"{get_tests_dir()}/fixtures/tests_samples/MRPC", overwrite_cache=True + ) + eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, 
mode="dev") + + training_args = TrainingArguments(output_dir="./examples", no_cuda=True) + trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) + result = trainer.evaluate() + self.assertLess(result["eval_loss"], 0.2) + + @slow + def test_trainer_eval_lm(self): + MODEL_ID = "distilroberta-base" + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + dataset = LineByLineTextDataset( + tokenizer=tokenizer, + file_path=PATH_SAMPLE_TEXT, + block_size=tokenizer.max_len_single_sentence, + ) + self.assertEqual(len(dataset), 31) + + def test_training_iterable_dataset(self): + config = RegressionModelConfig() + model = RegressionPreTrainedModel(config) + train_dataset = SampleIterableDataset() + + args = RegressionTrainingArguments(output_dir="./examples", max_steps=4) + trainer = Trainer(model=model, args=args, train_dataset=train_dataset) + trainer.train() + self.assertEqual(trainer.state.global_step, 4) + + loader = trainer.get_train_dataloader() + self.assertIsInstance(loader, torch.utils.data.DataLoader) + self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) + + def test_evaluation_iterable_dataset(self): + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + args = RegressionTrainingArguments(output_dir="./examples") + trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) + results = trainer.evaluate() + + x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + # With a number of elements not a round multiple of the batch size + eval_dataset = SampleIterableDataset(length=66) + results = trainer.evaluate(eval_dataset) + + x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0] + pred = 1.5 * x + 2.5 + expected_loss = ((pred - y) ** 2).mean() + self.assertAlmostEqual(results["eval_loss"], expected_loss) + expected_acc = AlmostAccuracy()((pred, y))["accuracy"] + self.assertAlmostEqual(results["eval_accuracy"], expected_acc) + + def test_predict_iterable_dataset(self): + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + args = RegressionTrainingArguments(output_dir="./examples") + trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy()) + + preds = trainer.predict(trainer.eval_dataset).predictions + x = eval_dataset.dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + # With a number of elements not a round multiple of the batch size + test_dataset = SampleIterableDataset(length=66) + preds = trainer.predict(test_dataset).predictions + x = test_dataset.dataset.x + self.assertTrue(np.allclose(preds, 1.5 * x + 2.5)) + + def test_num_train_epochs_in_training(self): + # len(train_dl) < gradient_accumulation_steps shouldn't give ``ZeroDivisionError`` when ``max_steps`` is given. + # It should give 1 update step for each epoch. 
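The arithmetic behind these two comments: the number of optimizer updates per epoch is the dataloader length divided by gradient_accumulation_steps, floored at one, so a dataloader shorter than the accumulation window still produces one update per epoch. As a sketch (mirroring the bookkeeping the comments describe, not quoting the Trainer source):

def updates_per_epoch(num_batches, grad_accum):
    return max(num_batches // grad_accum, 1)

assert updates_per_epoch(64 // 16, 5) == 1  # 4 batches, accumulation 5: still one update
assert updates_per_epoch(8, 2) == 4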
+ trainer = get_regression_trainer( + max_steps=3, train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5 + ) + train_output = trainer.train() + self.assertEqual(train_output.global_step, 3) + + # Even ``max_steps`` is not specified, we still expect 1 update step for each epoch if + # len(train_dl) < gradient_accumulation_steps. + trainer = get_regression_trainer(train_len=64, per_device_train_batch_size=16, gradient_accumulation_steps=5) + train_output = trainer.train() + self.assertEqual(train_output.global_step, int(self.n_epochs)) + + def test_early_stopping_callback(self): + # early stopping stops training before num_training_epochs + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=tmp_dir, + num_train_epochs=20, + gradient_accumulation_steps=1, + per_device_train_batch_size=16, + load_best_model_at_end=True, + evaluation_strategy=IntervalStrategy.EPOCH, + compute_metrics=AlmostAccuracy(), + metric_for_best_model="accuracy", + ) + trainer.add_callback(EarlyStoppingCallback(1, 0.0001)) + train_output = trainer.train() + self.assertLess(train_output.global_step, 20 * 64 / 16) + + # Invalid inputs to trainer with early stopping callback result in assertion error + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=tmp_dir, + num_train_epochs=20, + gradient_accumulation_steps=1, + per_device_train_batch_size=16, + evaluation_strategy=IntervalStrategy.EPOCH, + compute_metrics=AlmostAccuracy(), + metric_for_best_model="accuracy", + ) + trainer.add_callback(EarlyStoppingCallback(1)) + self.assertEqual(trainer.state.global_step, 0) + try: + trainer.train() + except AssertionError: + self.assertEqual(trainer.state.global_step, 0) + + def test_flos_extraction(self): + trainer = get_regression_trainer(learning_rate=0.1) + + def assert_flos_extraction(trainer, wrapped_model_to_check): + self.assertEqual(trainer.model, unwrap_model(wrapped_model_to_check)) + self.assertGreaterEqual(getattr(unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0) + + # with plain model + assert_flos_extraction(trainer, trainer.model) + + # with enforced DataParallel + assert_flos_extraction(trainer, nn.DataParallel(trainer.model)) + + trainer.train() + self.assertTrue(isinstance(trainer.state.total_flos, float)) + + def check_checkpoint_deletion(self, trainer, output_dir, expected): + # Make fake checkpoints + for n in [5, 10, 15, 20, 25]: + os.makedirs(os.path.join(output_dir, f"{PREFIX_CHECKPOINT_DIR}-{n}"), exist_ok=True) + trainer._rotate_checkpoints(output_dir=output_dir) + glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{PREFIX_CHECKPOINT_DIR}-*")] + values = [int(re.match(f".*{PREFIX_CHECKPOINT_DIR}-([0-9]+)", d).groups()[0]) for d in glob_checkpoints] + self.assertSetEqual(set(values), set(expected)) + + def test_checkpoint_rotation(self): + with tempfile.TemporaryDirectory() as tmp_dir: + # Without best model at end + trainer = get_regression_trainer(output_dir=tmp_dir, save_total_limit=2) + self.check_checkpoint_deletion(trainer, tmp_dir, [20, 25]) + + # With best model at end + trainer = get_regression_trainer(output_dir=tmp_dir, load_best_model_at_end=True, save_total_limit=2) + trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5") + self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25]) + + # Edge case: we don't always honor save_total_limit=1 if load_best_model_at_end=True to be able to resume + # from checkpoint + trainer = 
get_regression_trainer(output_dir=tmp_dir, load_best_model_at_end=True, save_total_limit=1) + trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25") + self.check_checkpoint_deletion(trainer, tmp_dir, [25]) + + trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5") + self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25]) + + def check_mem_metrics(self, trainer, check_func): + metrics = trainer.train().metrics + check_func("init_mem_cpu_alloc_delta", metrics) + check_func("train_mem_cpu_alloc_delta", metrics) + if torch.cuda.device_count() > 0: + check_func("init_mem_gpu_alloc_delta", metrics) + check_func("train_mem_gpu_alloc_delta", metrics) + + metrics = trainer.evaluate() + check_func("eval_mem_cpu_alloc_delta", metrics) + if torch.cuda.device_count() > 0: + check_func("eval_mem_gpu_alloc_delta", metrics) + + metrics = trainer.predict(RegressionDataset()).metrics + check_func("test_mem_cpu_alloc_delta", metrics) + if torch.cuda.device_count() > 0: + check_func("test_mem_gpu_alloc_delta", metrics) + + def test_mem_metrics(self): + + # with mem metrics enabled + trainer = get_regression_trainer(skip_memory_metrics=False) + self.check_mem_metrics(trainer, self.assertIn) + + # with mem metrics disabled + trainer = get_regression_trainer(skip_memory_metrics=True) + self.check_mem_metrics(trainer, self.assertNotIn) + + @require_torch_gpu + def test_fp16_full_eval(self): + + # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. + # it's using pretty large safety margins, but small enough to detect broken functionality. + debug = 0 + n_gpus = get_gpu_count() + + bs = 8 + eval_len = 16 * n_gpus + # make the params somewhat big so that there will be enough RAM consumed to be able to + # measure things. We should get about 64KB for a+b in fp32 + a = torch.ones(1000, bs) + 0.001 + b = torch.ones(1000, bs) - 0.001 + + # 1. with fp16_full_eval disabled + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False) + metrics = trainer.evaluate() + del trainer + gc.collect() + + fp32_init = metrics["init_mem_gpu_alloc_delta"] + fp32_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"fp32_init {fp32_init}") + print(f"fp32_eval {fp32_eval}") + + # here we expect the model to be preloaded in trainer.__init__ and consume around 64K gpu ram. + # perfect world: fp32_init == 64<<10 + self.assertGreater(fp32_init, 59_000) + # after eval should be no extra memory allocated - with a small margin (other than the peak + # memory consumption for the forward calculation that gets recovered) + # perfect world: fp32_eval == close to zero + self.assertLess(fp32_eval, 5_000) + + # 2. with fp16_full_eval enabled + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False) + metrics = trainer.evaluate() + fp16_init = metrics["init_mem_gpu_alloc_delta"] + fp16_eval = metrics["eval_mem_gpu_alloc_delta"] + + if debug: + print(f"fp16_init {fp16_init}") + print(f"fp16_eval {fp16_eval}") + + # here we expect the model to not be preloaded in trainer.__init__, so with a small margin it should be close to 0 + # perfect world: fp16_init == close to zero + self.assertLess(fp16_init, 5_000) + # here we put the model on device in eval and only `half()` of it, i.e. about 32K (again we ignore the peak margin which gets returned back) + # perfect world: fp16_eval == 32<<10 + self.assertGreater(fp16_eval, 27_000) + + # 3. 
relative comparison fp32 vs full fp16 + # fp16_eval should be about half of fp32_init + # perfect world: fp32_init/2 == fp16_eval + self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000) + + def test_no_wd_param_group(self): + model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) + trainer = Trainer(model=model) + trainer.create_optimizer_and_scheduler(10) + # fmt: off + wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight'] + # fmt: on + wd_params = [p for n, p in model.named_parameters() if n in wd_names] + no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names] + self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) + self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) + + +@require_torch +@is_staging_test +class TrainerIntegrationWithHubTester(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._api = HfApi(endpoint=ENDPOINT_STAGING) + cls._token = cls._api.login(username=USER, password=PASS) + + @classmethod + def tearDownClass(cls): + try: + cls._api.delete_repo(token=cls._token, name="test-trainer") + except HTTPError: + pass + + try: + cls._api.delete_repo(token=cls._token, name="test-trainer-org", organization="valid_org") + except HTTPError: + pass + + def test_push_to_hub(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir) + url = trainer.push_to_hub(repo_name="test-trainer", use_auth_token=self._token) + + # Extract repo_name from the url + re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) + self.assertTrue(re_search is not None) + repo_name = re_search.groups()[0] + + self.assertEqual(repo_name, f"{USER}/test-trainer") + + model = RegressionPreTrainedModel.from_pretrained(repo_name) + self.assertEqual(model.a.item(), trainer.model.a.item()) + self.assertEqual(model.b.item(), trainer.model.b.item()) + + def test_push_to_hub_in_organization(self): + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer(output_dir=tmp_dir) + trainer.save_model() + url = trainer.push_to_hub( + repo_name="test-trainer-org", organization="valid_org", use_auth_token=self._token + ) + + # Extract repo_name from the url + re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) + self.assertTrue(re_search is not None) + repo_name = re_search.groups()[0] + self.assertEqual(repo_name, "valid_org/test-trainer-org") + + model = RegressionPreTrainedModel.from_pretrained("valid_org/test-trainer-org") + self.assertEqual(model.a.item(), trainer.model.a.item()) + self.assertEqual(model.b.item(), trainer.model.b.item()) + + +@require_torch +@require_optuna +class TrainerHyperParameterOptunaIntegrationTest(unittest.TestCase): + def setUp(self): + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + + def test_hyperparameter_search(self): + class MyTrialShortNamer(TrialShortNamer): + DEFAULTS = {"a": 0, "b": 0} + + def hp_space(trial): + return {} + + def model_init(trial): + if trial is not None: + a = trial.suggest_int("a", -4, 4) + b = trial.suggest_int("b", -4, 4) + else: + a = 0 + b = 0 + config = RegressionModelConfig(a=a, b=b, double_output=False) + + return RegressionPreTrainedModel(config) + + def hp_name(trial): + return MyTrialShortNamer.shortname(trial.params) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = 
get_regression_trainer( + output_dir=tmp_dir, + learning_rate=0.1, + logging_steps=1, + evaluation_strategy=IntervalStrategy.EPOCH, + num_train_epochs=4, + disable_tqdm=True, + load_best_model_at_end=True, + logging_dir="runs", + run_name="test", + model_init=model_init, + ) + trainer.hyperparameter_search(direction="minimize", hp_space=hp_space, hp_name=hp_name, n_trials=4) + + +@require_torch +@require_ray +class TrainerHyperParameterRayIntegrationTest(unittest.TestCase): + def setUp(self): + args = TrainingArguments(".") + self.n_epochs = args.num_train_epochs + self.batch_size = args.train_batch_size + + def ray_hyperparameter_search(self): + class MyTrialShortNamer(TrialShortNamer): + DEFAULTS = {"a": 0, "b": 0} + + def hp_space(trial): + from ray import tune + + return { + "a": tune.randint(-4, 4), + "b": tune.randint(-4, 4), + } + + def model_init(config): + if config is None: + a = 0 + b = 0 + else: + a = config["a"] + b = config["b"] + model_config = RegressionModelConfig(a=a, b=b, double_output=False) + + return RegressionPreTrainedModel(model_config) + + def hp_name(params): + return MyTrialShortNamer.shortname(params) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=tmp_dir, + learning_rate=0.1, + logging_steps=1, + evaluation_strategy=IntervalStrategy.EPOCH, + num_train_epochs=4, + disable_tqdm=True, + load_best_model_at_end=True, + logging_dir="runs", + run_name="test", + model_init=model_init, + ) + trainer.hyperparameter_search( + direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="ray", n_trials=4 + ) + + def test_hyperparameter_search(self): + self.ray_hyperparameter_search() + + def test_hyperparameter_search_ray_client(self): + import ray + from ray.util.client.ray_client_helpers import ray_start_client_server + + with ray_start_client_server(): + assert ray.util.client.ray.is_connected() + self.ray_hyperparameter_search() diff --git a/test_trainer_callback.py b/test_trainer_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..6ce90b85546d0a4194bd5762f0f71f58375464da --- /dev/null +++ b/test_trainer_callback.py @@ -0,0 +1,241 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +from transformers import ( + DefaultFlowCallback, + IntervalStrategy, + PrinterCallback, + ProgressCallback, + Trainer, + TrainerCallback, + TrainingArguments, + is_torch_available, +) +from transformers.testing_utils import require_torch + + +if is_torch_available(): + from transformers.trainer import DEFAULT_CALLBACKS + + from .test_trainer import RegressionDataset, RegressionModelConfig, RegressionPreTrainedModel + + +class MyTestTrainerCallback(TrainerCallback): + "A callback that registers the events that go through." 
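Before the event-recording methods below, a word on the hook protocol: the Trainer fires these callbacks in a fixed order and hands each one the TrainingArguments, TrainerState and TrainerControl, and a callback can steer training by mutating the control object. A minimal sketch of a custom callback built on the same API (illustrative, not part of this test file):

from transformers import TrainerCallback

class StopAfterNSteps(TrainerCallback):
    """Stops training once state.global_step reaches a threshold."""

    def __init__(self, n):
        self.n = n

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step >= self.n:
            control.should_training_stop = True
        return control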
+ + def __init__(self): + self.events = [] + + def on_init_end(self, args, state, control, **kwargs): + self.events.append("on_init_end") + + def on_train_begin(self, args, state, control, **kwargs): + self.events.append("on_train_begin") + + def on_train_end(self, args, state, control, **kwargs): + self.events.append("on_train_end") + + def on_epoch_begin(self, args, state, control, **kwargs): + self.events.append("on_epoch_begin") + + def on_epoch_end(self, args, state, control, **kwargs): + self.events.append("on_epoch_end") + + def on_step_begin(self, args, state, control, **kwargs): + self.events.append("on_step_begin") + + def on_step_end(self, args, state, control, **kwargs): + self.events.append("on_step_end") + + def on_evaluate(self, args, state, control, **kwargs): + self.events.append("on_evaluate") + + def on_save(self, args, state, control, **kwargs): + self.events.append("on_save") + + def on_log(self, args, state, control, **kwargs): + self.events.append("on_log") + + def on_prediction_step(self, args, state, control, **kwargs): + self.events.append("on_prediction_step") + + +@require_torch +class TrainerCallbackTest(unittest.TestCase): + def setUp(self): + self.output_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.output_dir) + + def get_trainer(self, a=0, b=0, train_len=64, eval_len=64, callbacks=None, disable_tqdm=False, **kwargs): + # disable_tqdm in TrainingArguments has a flaky default since it depends on the level of logging. We make sure + # its set to False since the tests later on depend on its value. + train_dataset = RegressionDataset(length=train_len) + eval_dataset = RegressionDataset(length=eval_len) + config = RegressionModelConfig(a=a, b=b) + model = RegressionPreTrainedModel(config) + + args = TrainingArguments(self.output_dir, disable_tqdm=disable_tqdm, report_to=[], **kwargs) + return Trainer( + model, + args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + callbacks=callbacks, + ) + + def check_callbacks_equality(self, cbs1, cbs2): + self.assertEqual(len(cbs1), len(cbs2)) + + # Order doesn't matter + cbs1 = list(sorted(cbs1, key=lambda cb: cb.__name__ if isinstance(cb, type) else cb.__class__.__name__)) + cbs2 = list(sorted(cbs2, key=lambda cb: cb.__name__ if isinstance(cb, type) else cb.__class__.__name__)) + + for cb1, cb2 in zip(cbs1, cbs2): + if isinstance(cb1, type) and isinstance(cb2, type): + self.assertEqual(cb1, cb2) + elif isinstance(cb1, type) and not isinstance(cb2, type): + self.assertEqual(cb1, cb2.__class__) + elif not isinstance(cb1, type) and isinstance(cb2, type): + self.assertEqual(cb1.__class__, cb2) + else: + self.assertEqual(cb1, cb2) + + def get_expected_events(self, trainer): + expected_events = ["on_init_end", "on_train_begin"] + step = 0 + train_dl_len = len(trainer.get_eval_dataloader()) + evaluation_events = ["on_prediction_step"] * len(trainer.get_eval_dataloader()) + ["on_log", "on_evaluate"] + for _ in range(trainer.state.num_train_epochs): + expected_events.append("on_epoch_begin") + for _ in range(train_dl_len): + step += 1 + expected_events += ["on_step_begin", "on_step_end"] + if step % trainer.args.logging_steps == 0: + expected_events.append("on_log") + if trainer.args.evaluation_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: + expected_events += evaluation_events.copy() + if step % trainer.args.save_steps == 0: + expected_events.append("on_save") + expected_events.append("on_epoch_end") + if trainer.args.evaluation_strategy == IntervalStrategy.EPOCH: 
+ expected_events += evaluation_events.copy() + expected_events += ["on_log", "on_train_end"] + return expected_events + + def test_init_callback(self): + trainer = self.get_trainer() + expected_callbacks = DEFAULT_CALLBACKS.copy() + [ProgressCallback] + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + # Callbacks passed at init are added to the default callbacks + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback]) + expected_callbacks.append(MyTestTrainerCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + # TrainingArguments.disable_tqdm controls if use ProgressCallback or PrinterCallback + trainer = self.get_trainer(disable_tqdm=True) + expected_callbacks = DEFAULT_CALLBACKS.copy() + [PrinterCallback] + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + def test_add_remove_callback(self): + expected_callbacks = DEFAULT_CALLBACKS.copy() + [ProgressCallback] + trainer = self.get_trainer() + + # We can add, pop, or remove by class name + trainer.remove_callback(DefaultFlowCallback) + expected_callbacks.remove(DefaultFlowCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + trainer = self.get_trainer() + cb = trainer.pop_callback(DefaultFlowCallback) + self.assertEqual(cb.__class__, DefaultFlowCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + trainer.add_callback(DefaultFlowCallback) + expected_callbacks.insert(0, DefaultFlowCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + # We can also add, pop, or remove by instance + trainer = self.get_trainer() + cb = trainer.callback_handler.callbacks[0] + trainer.remove_callback(cb) + expected_callbacks.remove(DefaultFlowCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + trainer = self.get_trainer() + cb1 = trainer.callback_handler.callbacks[0] + cb2 = trainer.pop_callback(cb1) + self.assertEqual(cb1, cb2) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + trainer.add_callback(cb1) + expected_callbacks.insert(0, DefaultFlowCallback) + self.check_callbacks_equality(trainer.callback_handler.callbacks, expected_callbacks) + + def test_event_flow(self): + import warnings + + # XXX: for now ignore scatter_gather warnings in this test since it's not relevant to what's being tested + warnings.simplefilter(action="ignore", category=UserWarning) + + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback]) + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + # Independent log/save/eval + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], logging_steps=5) + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], save_steps=5) + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + trainer = self.get_trainer(callbacks=[MyTestTrainerCallback], eval_steps=5, evaluation_strategy="steps") + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + trainer = 
self.get_trainer(callbacks=[MyTestTrainerCallback], evaluation_strategy="epoch") + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + # A bit of everything + trainer = self.get_trainer( + callbacks=[MyTestTrainerCallback], + logging_steps=3, + save_steps=10, + eval_steps=5, + evaluation_strategy="steps", + ) + trainer.train() + events = trainer.callback_handler.callbacks[-2].events + self.assertEqual(events, self.get_expected_events(trainer)) + + # warning should be emitted for duplicated callbacks + with unittest.mock.patch("transformers.trainer_callback.logger.warning") as warn_mock: + trainer = self.get_trainer( + callbacks=[MyTestTrainerCallback, MyTestTrainerCallback], + ) + assert str(MyTestTrainerCallback) in warn_mock.call_args[0][0] diff --git a/test_trainer_distributed.py b/test_trainer_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..b40526c6de78083c8b03da7eb614bb3f9953f81d --- /dev/null +++ b/test_trainer_distributed.py @@ -0,0 +1,143 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from typing import Dict + +from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available +from transformers.testing_utils import ( + TestCasePlus, + execute_subprocess_async, + get_torch_dist_unique_port, + require_torch_multi_gpu, +) +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + from torch import nn + from torch.utils.data.dataset import Dataset + + from transformers import Trainer + + class DummyDataset(Dataset): + def __init__(self, length: int = 101): + self.length = length + + def __len__(self): + return self.length + + def __getitem__(self, i) -> int: + return i + + class DummyDataCollator: + def __call__(self, features): + return {"input_ids": torch.tensor(features), "labels": torch.tensor(features)} + + class DummyModel(nn.Module): + def __init__(self): + super().__init__() + # Add some (unused) params otherwise DDP will complain. 
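The distributed test below hinges on ordered gathering: each rank evaluates a contiguous shard (padded so every rank gets the same count), and the gathered predictions must reassemble into the original dataset order with the padding trimmed. The sharding, sketched in the spirit of the sampler the Trainer uses for distributed evaluation (illustrative, not the real class):

import math

def sequential_shard(dataset_len, world_size, rank):
    per_rank = math.ceil(dataset_len / world_size)
    indices = list(range(dataset_len))
    indices += indices[: per_rank * world_size - dataset_len]  # pad by wrapping to the front
    return indices[rank * per_rank : (rank + 1) * per_rank]

shards = [sequential_shard(101, 2, r) for r in range(2)]
merged = [i for shard in shards for i in shard][:101]  # concatenate, then trim the padding
assert merged == list(range(101))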
+ self.fc = nn.Linear(120, 80) + + def forward(self, input_ids, labels=None): + if labels is not None: + return torch.tensor(0.0, device=input_ids.device), input_ids + else: + return input_ids + + +class TestTrainerDistributed(TestCasePlus): + @require_torch_multi_gpu + def test_trainer(self): + + distributed_args = f""" + -m torch.distributed.launch + --nproc_per_node={torch.cuda.device_count()} + --master_port={get_torch_dist_unique_port()} + {self.test_file_dir}/test_trainer_distributed.py + """.split() + output_dir = self.get_auto_remove_tmp_dir() + args = f"--output_dir {output_dir}".split() + cmd = [sys.executable] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + # A successful return here means success; any error would have raised an exception in the sub-call above + + +if __name__ == "__main__": + # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs: + # + # PYTHONPATH="src" python -m torch.distributed.launch --nproc_per_node 2 ./tests/test_trainer_distributed.py --output_dir output_dir + + parser = HfArgumentParser((TrainingArguments,)) + training_args = parser.parse_args_into_dataclasses()[0] + + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {training_args.local_rank != -1}" + ) + + # Essentially, what we want to verify in the distributed case is that we get all samples back, + # in the right order. (this is crucial for prediction for instance) + for dataset_length in [101, 40, 7]: + dataset = DummyDataset(dataset_length) + + def compute_metrics(p: EvalPrediction) -> Dict: + sequential = list(range(len(dataset))) + success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential + if not success and training_args.local_rank == 0: + logger.warning( + "Predictions and/or labels do not match expected results:\n - predictions: " + f"{p.predictions.tolist()}\n - labels: {p.label_ids.tolist()}\n - expected: {sequential}" + ) + return {"success": success} + + trainer = Trainer( + model=DummyModel(), + args=training_args, + data_collator=DummyDataCollator(), + eval_dataset=dataset, + compute_metrics=compute_metrics, + ) + metrics = trainer.evaluate() + logger.info(metrics) + if metrics["eval_success"] is not True: + logger.error(metrics) + exit(1) + + p = trainer.predict(dataset) + logger.info(p.metrics) + if p.metrics["test_success"] is not True: + logger.error(p.metrics) + exit(1) + + trainer.args.eval_accumulation_steps = 2 + + metrics = trainer.evaluate() + logger.info(metrics) + if metrics["eval_success"] is not True: + logger.error(metrics) + exit(1) + + p = trainer.predict(dataset) + logger.info(p.metrics) + if p.metrics["test_success"] is not True: + logger.error(p.metrics) + exit(1) + + trainer.args.eval_accumulation_steps = None diff --git a/test_trainer_seq2seq.py b/test_trainer_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..7931ca84480422a40f2926205cb37e4a1ae98481 --- /dev/null +++ b/test_trainer_seq2seq.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2020 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments +from transformers.file_utils import is_datasets_available +from transformers.testing_utils import TestCasePlus, require_datasets, require_torch, slow + + +if is_datasets_available(): + import datasets + + +class Seq2seqTrainerTester(TestCasePlus): + @slow + @require_torch + @require_datasets + def test_finetune_bert2bert(self): + bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny") + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + + bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size + bert2bert.config.eos_token_id = tokenizer.sep_token_id + bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id + bert2bert.config.max_length = 128 + + train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]") + val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]") + + train_dataset = train_dataset.select(range(32)) + val_dataset = val_dataset.select(range(16)) + + batch_size = 4 + + def _map_to_encoder_decoder_inputs(batch): + # Tokenizer will automatically set [BOS] [EOS] + inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512) + outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128) + batch["input_ids"] = inputs.input_ids + batch["attention_mask"] = inputs.attention_mask + + batch["decoder_input_ids"] = outputs.input_ids + batch["labels"] = outputs.input_ids.copy() + batch["labels"] = [ + [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"] + ] + batch["decoder_attention_mask"] = outputs.attention_mask + + assert all([len(x) == 512 for x in inputs.input_ids]) + assert all([len(x) == 128 for x in outputs.input_ids]) + + return batch + + def _compute_metrics(pred): + labels_ids = pred.label_ids + pred_ids = pred.predictions + + # all unnecessary tokens are removed + pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True) + label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True) + + accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str) + + return {"accuracy": accuracy} + + # map train dataset + train_dataset = train_dataset.map( + _map_to_encoder_decoder_inputs, + batched=True, + batch_size=batch_size, + remove_columns=["article", "highlights"], + ) + train_dataset.set_format( + type="torch", + columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"], + ) + + # same for validation dataset + val_dataset = val_dataset.map( + _map_to_encoder_decoder_inputs, + batched=True, + batch_size=batch_size, + remove_columns=["article", "highlights"], + ) + val_dataset.set_format( + type="torch", + columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"], + ) + + output_dir = self.get_auto_remove_tmp_dir() + + training_args = Seq2SeqTrainingArguments( + output_dir=output_dir, + 
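+ # (Descriptive comment added for clarity: the tiny batch size plus eval/logging every 2 steps keep this slow-marked + # test short, and predict_with_generate below makes evaluation call generate(), so _compute_metrics receives token ids.)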
per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + predict_with_generate=True, + evaluation_strategy="steps", + do_train=True, + do_eval=True, + warmup_steps=0, + eval_steps=2, + logging_steps=2, + ) + + # instantiate trainer + trainer = Seq2SeqTrainer( + model=bert2bert, + args=training_args, + compute_metrics=_compute_metrics, + train_dataset=train_dataset, + eval_dataset=val_dataset, + tokenizer=tokenizer, + ) + + # start training + trainer.train() diff --git a/test_trainer_tpu.py b/test_trainer_tpu.py new file mode 100644 index 0000000000000000000000000000000000000000..0ef90a9f1cd44173d8f9ac673fd49ef9281f929b --- /dev/null +++ b/test_trainer_tpu.py @@ -0,0 +1,131 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This test is meant to be run on an instance with TPUs like this: +# +# python examples/pytorch/xla_spawn.py --num_cores=8 tests/test_trainer_tpu.py +# +# Replace 8 with the number of TPU cores you have. +# + +import sys +from typing import Dict + +from transformers import EvalPrediction, HfArgumentParser, TrainingArguments, is_torch_available +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + from torch import nn + from torch.utils.data.dataset import Dataset + + from transformers import Trainer + + class DummyDataset(Dataset): + def __init__(self, length: int = 101): + self.length = length + + def __len__(self): + return self.length + + def __getitem__(self, i) -> int: + return i + + class DummyDataCollator: + def __call__(self, features): + return {"input_ids": torch.tensor(features), "labels": torch.tensor(features)} + + class DummyModel(nn.Module): + def __init__(self): + super().__init__() + # Add some (unused) params otherwise DDP will complain. + self.fc = nn.Linear(120, 80) + + def forward(self, input_ids, labels=None): + if labels is not None: + return torch.tensor(0.0, device=input_ids.device), input_ids + else: + return input_ids + + +def main(): + parser = HfArgumentParser((TrainingArguments,)) + sys.argv += ["--output_dir", "./examples"] + training_args = parser.parse_args_into_dataclasses()[0] + + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, " + f"tpu_num_cores: {training_args.tpu_num_cores}", + ) + + # Essentially, what we want to verify in the distributed case is + # that we get all samples back, in the right order. 
+ # (this is crucial for prediction for instance) + for dataset_length in [1001, 256, 15]: + dataset = DummyDataset(dataset_length) + + def compute_metrics(p: EvalPrediction) -> Dict: + sequential = list(range(len(dataset))) + success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential + return {"success": success} + + trainer = Trainer( + model=DummyModel(), + args=training_args, + data_collator=DummyDataCollator(), + eval_dataset=dataset, + compute_metrics=compute_metrics, + ) + metrics = trainer.evaluate() + logger.info(metrics) + if metrics["eval_success"] is not True: + logger.error(metrics) + exit(1) + + p = trainer.predict(dataset) + logger.info(p.metrics) + if p.metrics["test_success"] is not True: + logger.error(p.metrics) + exit(1) + + trainer.args.eval_accumulation_steps = 2 + + metrics = trainer.evaluate() + logger.info(metrics) + if metrics["eval_success"] is not True: + logger.error(metrics) + exit(1) + + p = trainer.predict(dataset) + logger.info(p.metrics) + if p.metrics["test_success"] is not True: + logger.error(p.metrics) + exit(1) + + trainer.args.eval_accumulation_steps = None + + logger.info("🔥 All distributed tests successful") + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/test_trainer_utils.py b/test_trainer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80096742868a5f9f6bb631f9bd44e26a495dfc12 --- /dev/null +++ b/test_trainer_utils.py @@ -0,0 +1,394 @@ +# coding=utf-8 +# Copyright 2018 the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
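+ +# Note (added for orientation): the tests below exercise the torch-side Trainer utilities from +# transformers.trainer_pt_utils (samplers, DistributedTensorGatherer, LabelSmoother, get_parameter_names) +# in isolation, without instantiating a full Trainer.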
+ +import copy +import unittest + +import numpy as np + +from transformers.file_utils import is_torch_available +from transformers.testing_utils import require_torch + + +if is_torch_available(): + import torch + from torch import nn + from torch.utils.data import IterableDataset + + from transformers.modeling_outputs import SequenceClassifierOutput + from transformers.tokenization_utils_base import BatchEncoding + from transformers.trainer_pt_utils import ( + DistributedLengthGroupedSampler, + DistributedSamplerWithLoop, + DistributedTensorGatherer, + IterableDatasetShard, + LabelSmoother, + LengthGroupedSampler, + SequentialDistributedSampler, + ShardSampler, + get_parameter_names, + ) + + class TstLayer(nn.Module): + def __init__(self, hidden_size): + super().__init__() + self.linear1 = nn.Linear(hidden_size, hidden_size) + self.ln1 = nn.LayerNorm(hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.ln2 = nn.LayerNorm(hidden_size) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + + def forward(self, x): + h = self.ln1(nn.functional.relu(self.linear1(x))) + h = nn.functional.relu(self.linear2(x)) + return self.ln2(x + h + self.bias) + + class RandomIterableDataset(IterableDataset): + # For testing, an iterable dataset of random length + def __init__(self, p_stop=0.01, max_length=1000): + self.p_stop = p_stop + self.max_length = max_length + self.generator = torch.Generator() + + def __iter__(self): + count = 0 + stop = False + while not stop and count < self.max_length: + yield count + count += 1 + number = torch.rand(1, generator=self.generator).item() + stop = number < self.p_stop + + +@require_torch +class TrainerUtilsTest(unittest.TestCase): + def test_distributed_tensor_gatherer(self): + # Simulate a result with a dataset of size 21, 4 processes and chunks of lengths 2, 3, 1 + world_size = 4 + num_samples = 21 + input_indices = [ + [0, 1, 6, 7, 12, 13, 18, 19], + [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 0, 1], + [5, 11, 17, 2], + ] + + predictions = np.random.normal(size=(num_samples, 13)) + gatherer = DistributedTensorGatherer(world_size=world_size, num_samples=num_samples) + for indices in input_indices: + gatherer.add_arrays(predictions[indices]) + result = gatherer.finalize() + self.assertTrue(np.array_equal(result, predictions)) + + # With nested tensors + gatherer = DistributedTensorGatherer(world_size=world_size, num_samples=num_samples) + for indices in input_indices: + gatherer.add_arrays([predictions[indices], [predictions[indices], predictions[indices]]]) + result = gatherer.finalize() + self.assertTrue(isinstance(result, list)) + self.assertEqual(len(result), 2) + self.assertTrue(isinstance(result[1], list)) + self.assertEqual(len(result[1]), 2) + self.assertTrue(np.array_equal(result[0], predictions)) + self.assertTrue(np.array_equal(result[1][0], predictions)) + self.assertTrue(np.array_equal(result[1][1], predictions)) + + def test_distributed_tensor_gatherer_different_shapes(self): + # Simulate a result with a dataset of size 21, 4 processes and chunks of lengths 2, 3, 1 + world_size = 4 + num_samples = 21 + input_indices = [ + [0, 1, 6, 7, 12, 13, 18, 19], + [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 0, 1], + [5, 11, 17, 2], + ] + sequence_lengths = [8, 10, 13] + + predictions = np.random.normal(size=(num_samples, 13)) + gatherer = DistributedTensorGatherer(world_size=world_size, num_samples=num_samples) + for indices, seq_length in zip(input_indices, sequence_lengths): + gatherer.add_arrays(predictions[indices, :seq_length]) + result = 
gatherer.finalize() + + # Remove the extra samples added at the end for a round multiple of num processes. + actual_indices = [input_indices[0], input_indices[1][:-2], input_indices[2][:-1]] + for indices, seq_length in zip(actual_indices, sequence_lengths): + self.assertTrue(np.array_equal(result[indices, :seq_length], predictions[indices, :seq_length])) + + # With nested tensors + predictions = np.random.normal(size=(num_samples, 13)) + gatherer = DistributedTensorGatherer(world_size=world_size, num_samples=num_samples) + for indices, seq_length in zip(input_indices, sequence_lengths): + gatherer.add_arrays([predictions[indices, :seq_length], predictions[indices]]) + result = gatherer.finalize() + + for indices, seq_length in zip(actual_indices, sequence_lengths): + self.assertTrue(np.array_equal(result[0][indices, :seq_length], predictions[indices, :seq_length])) + self.assertTrue(np.array_equal(result[1], predictions)) + + # Check it also works if the varying seq_length is second + gatherer = DistributedTensorGatherer(world_size=world_size, num_samples=num_samples) + for indices, seq_length in zip(input_indices, sequence_lengths): + gatherer.add_arrays([predictions[indices], predictions[indices, :seq_length]]) + result = gatherer.finalize() + + self.assertTrue(np.array_equal(result[0], predictions)) + for indices, seq_length in zip(actual_indices, sequence_lengths): + self.assertTrue(np.array_equal(result[1][indices, :seq_length], predictions[indices, :seq_length])) + + def test_label_smoothing(self): + epsilon = 0.1 + num_labels = 12 + random_logits = torch.randn(4, 5, num_labels) + random_labels = torch.randint(0, num_labels, (4, 5)) + loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1)) + model_output = SequenceClassifierOutput(logits=random_logits) + label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels) + log_probs = -nn.functional.log_softmax(random_logits, dim=-1) + expected_loss = (1 - epsilon) * loss + epsilon * log_probs.mean() + self.assertTrue(torch.allclose(label_smoothed_loss, expected_loss)) + + # With a few -100 labels + random_labels[0, 1] = -100 + random_labels[2, 1] = -100 + random_labels[2, 3] = -100 + + loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1)) + model_output = SequenceClassifierOutput(logits=random_logits) + label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels) + log_probs = -nn.functional.log_softmax(random_logits, dim=-1) + # Mask the log probs with the -100 labels + log_probs[0, 1] = 0.0 + log_probs[2, 1] = 0.0 + log_probs[2, 3] = 0.0 + expected_loss = (1 - epsilon) * loss + epsilon * log_probs.sum() / (num_labels * 17) + self.assertTrue(torch.allclose(label_smoothed_loss, expected_loss)) + + def test_group_by_length(self): + # Get some inputs of random lengths + lengths = torch.randint(0, 25, (100,)).tolist() + # Put one bigger than the others to check it ends up in first position + lengths[32] = 50 + + indices = list(LengthGroupedSampler(lengths, 4, lengths=lengths)) + # The biggest element should be first + self.assertEqual(lengths[indices[0]], 50) + # The indices should be a permutation of range(100) + self.assertEqual(list(sorted(indices)), list(range(100))) + + def test_group_by_length_with_dict(self): + # Get some inputs of random lengths + data = [] + for _ in range(6): + input_ids = torch.randint(0, 25, (100,)).tolist() + data.append({"input_ids": input_ids}) + # Put one bigger than the others to check it ends up in first position + 
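+ # (105 tokens here, vs. 100 for every other entry) 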
data[3]["input_ids"] = torch.randint(0, 25, (105,)).tolist() + + indices = list(LengthGroupedSampler(data, 4)) + # The biggest element should be first + self.assertEqual(len(data[indices[0]]["input_ids"]), 105) + # The indices should be a permutation of range(6) + self.assertEqual(list(sorted(indices)), list(range(6))) + + def test_group_by_length_with_batch_encoding(self): + # Get some inputs of random lengths + data = [] + for _ in range(6): + input_ids = torch.randint(0, 25, (100,)).tolist() + data.append(BatchEncoding({"input_ids": input_ids})) + # Put one bigger than the others to check it ends up in first position + data[3]["input_ids"] = torch.randint(0, 25, (105,)).tolist() + + indices = list(LengthGroupedSampler(data, 4)) + # The biggest element should be first + self.assertEqual(len(data[indices[0]]["input_ids"]), 105) + # The indices should be a permutation of range(6) + self.assertEqual(list(sorted(indices)), list(range(6))) + + def test_distributed_length_grouped(self): + # Get some inputs of random lengths + lengths = torch.randint(0, 25, (100,)).tolist() + # Put one bigger than the others to check it ends up in first position + lengths[32] = 50 + + indices_process_0 = list(DistributedLengthGroupedSampler(lengths, 4, 2, 0, lengths=lengths)) + indices_process_1 = list(DistributedLengthGroupedSampler(lengths, 4, 2, 1, lengths=lengths)) + # The biggest element should be first + self.assertEqual(lengths[indices_process_0[0]], 50) + # The indices should be a permutation of range(100) + self.assertEqual(list(sorted(indices_process_0 + indices_process_1)), list(range(100))) + + def test_get_parameter_names(self): + model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) + # fmt: off + self.assertEqual( + get_parameter_names(model, [nn.LayerNorm]), + ['0.linear1.weight', '0.linear1.bias', '0.linear2.weight', '0.linear2.bias', '0.bias', '1.0.linear1.weight', '1.0.linear1.bias', '1.0.linear2.weight', '1.0.linear2.bias', '1.0.bias', '1.1.linear1.weight', '1.1.linear1.bias', '1.1.linear2.weight', '1.1.linear2.bias', '1.1.bias'] + ) + # fmt: on + + def test_distributed_sampler_with_loop(self): + batch_size = 16 + for length in [23, 64, 123]: + dataset = list(range(length)) + shard1 = DistributedSamplerWithLoop(dataset, batch_size, num_replicas=2, rank=0) + shard2 = DistributedSamplerWithLoop(dataset, batch_size, num_replicas=2, rank=1) + + # Set seeds + shard1.set_epoch(0) + shard2.set_epoch(0) + + # Sample + samples1 = list(shard1) + samples2 = list(shard2) + + self.assertTrue(len(samples1) % batch_size == 0) + self.assertTrue(len(samples2) % batch_size == 0) + + total = [] + for sample1, sample2 in zip(samples1, samples2): + total += [sample1, sample2] + + self.assertEqual(set(total[:length]), set(dataset)) + self.assertEqual(set(total[length:]), set(total[: (len(total) - length)])) + + def test_sequential_distributed_sampler(self): + batch_size = 16 + for length in [23, 64, 123]: + dataset = list(range(length)) + shard1 = SequentialDistributedSampler(dataset, num_replicas=2, rank=0) + shard2 = SequentialDistributedSampler(dataset, num_replicas=2, rank=1) + + # Sample + samples1 = list(shard1) + samples2 = list(shard2) + + total = samples1 + samples2 + + self.assertListEqual(total[:length], dataset) + self.assertListEqual(total[length:], dataset[: (len(total) - length)]) + + # With a batch_size passed + shard1 = SequentialDistributedSampler(dataset, num_replicas=2, rank=0, batch_size=batch_size) + shard2 = SequentialDistributedSampler(dataset, num_replicas=2, 
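+ # the extra batch_size argument makes the sampler pad each shard to a round multiple of batch_size, as asserted below 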
rank=1, batch_size=batch_size) + + # Sample + samples1 = list(shard1) + samples2 = list(shard2) + + self.assertTrue(len(samples1) % batch_size == 0) + self.assertTrue(len(samples2) % batch_size == 0) + + total = samples1 + samples2 + + self.assertListEqual(total[:length], dataset) + self.assertListEqual(total[length:], dataset[: (len(total) - length)]) + + def check_iterable_dataset_shard(self, dataset, batch_size, drop_last, num_processes=2, epoch=0): + # Set the seed for the base dataset to get the proper reference. + dataset.generator.manual_seed(epoch) + reference = list(dataset) + + shards = [ + IterableDatasetShard( + dataset, batch_size=batch_size, drop_last=drop_last, num_processes=num_processes, process_index=i + ) + for i in range(num_processes) + ] + for shard in shards: + shard.set_epoch(epoch) + shard_lists = [list(shard) for shard in shards] + + for shard in shard_lists: + # All shards have a number of samples that is a round multiple of batch size + self.assertTrue(len(shard) % batch_size == 0) + # All shards have the same number of samples + self.assertEqual(len(shard), len(shard_lists[0])) + + for shard in shards: + # All shards know the total number of samples + self.assertEqual(shard.num_examples, len(reference)) + + observed = [] + for idx in range(0, len(shard_lists[0]), batch_size): + for shard in shard_lists: + observed += shard[idx : idx + batch_size] + + # If drop_last is False we loop through samples at the beginning to have a size that is a round multiple of + # batch_size + if not drop_last: + while len(reference) < len(observed): + reference += reference + self.assertListEqual(observed, reference[: len(observed)]) + + # Check equivalence between IterableDataset and ShardSampler + dataset.generator.manual_seed(epoch) + reference = list(dataset) + + sampler_shards = [ + ShardSampler( + reference, batch_size=batch_size, drop_last=drop_last, num_processes=num_processes, process_index=i + ) + for i in range(num_processes) + ] + for shard, sampler_shard in zip(shard_lists, sampler_shards): + self.assertListEqual(shard, list(sampler_shard)) + + def test_iterable_dataset_shard(self): + dataset = RandomIterableDataset() + + self.check_iterable_dataset_shard(dataset, 4, drop_last=True, num_processes=2, epoch=0) + self.check_iterable_dataset_shard(dataset, 4, drop_last=False, num_processes=2, epoch=0) + + self.check_iterable_dataset_shard(dataset, 4, drop_last=True, num_processes=3, epoch=42) + self.check_iterable_dataset_shard(dataset, 4, drop_last=False, num_processes=3, epoch=42) + + def check_shard_sampler(self, dataset, batch_size, drop_last, num_processes=2): + shards = [ + ShardSampler( + dataset, batch_size=batch_size, drop_last=drop_last, num_processes=num_processes, process_index=i + ) + for i in range(num_processes) + ] + shard_lists = [list(shard) for shard in shards] + + for shard in shard_lists: + # All shards have a number of samples that is a round multiple of batch size + self.assertTrue(len(shard) % batch_size == 0) + # All shards have the same number of samples + self.assertEqual(len(shard), len(shard_lists[0])) + + observed = [] + for idx in range(0, len(shard_lists[0]), batch_size): + for shard in shard_lists: + observed += shard[idx : idx + batch_size] + + # If drop_last is False we loop through samples at the beginning to have a size that is a round multiple of + # batch_size + reference = copy.copy(dataset) + if not drop_last: + while len(reference) < len(observed): + reference += reference + self.assertListEqual(observed, reference[: 
len(observed)]) + + def test_shard_sampler(self): + for n_elements in [64, 123]: + dataset = list(range(n_elements)) + + self.check_shard_sampler(dataset, 4, drop_last=True, num_processes=2) + self.check_shard_sampler(dataset, 4, drop_last=False, num_processes=2) + + self.check_shard_sampler(dataset, 4, drop_last=True, num_processes=3) + self.check_shard_sampler(dataset, 4, drop_last=False, num_processes=3) diff --git a/test_utils_check_copies.py b/test_utils_check_copies.py new file mode 100644 index 0000000000000000000000000000000000000000..067bd45efaf15eed62178a7094ba64db8411b6d8 --- /dev/null +++ b/test_utils_check_copies.py @@ -0,0 +1,122 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import shutil +import sys +import tempfile +import unittest + +import black + + +git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) +sys.path.append(os.path.join(git_repo_path, "utils")) + +import check_copies # noqa: E402 + + +# This is the reference code that will be used in the tests. +# If BertLMPredictionHead is changed in modeling_bert.py, this code needs to be manually updated. +REFERENCE_CODE = """ def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states +""" + + +class CopyCheckTester(unittest.TestCase): + def setUp(self): + self.transformer_dir = tempfile.mkdtemp() + os.makedirs(os.path.join(self.transformer_dir, "models/bert/")) + check_copies.TRANSFORMER_PATH = self.transformer_dir + shutil.copy( + os.path.join(git_repo_path, "src/transformers/models/bert/modeling_bert.py"), + os.path.join(self.transformer_dir, "models/bert/modeling_bert.py"), + ) + + def tearDown(self): + check_copies.TRANSFORMER_PATH = "src/transformers" + shutil.rmtree(self.transformer_dir) + + def check_copy_consistency(self, comment, class_name, class_code, overwrite_result=None): + code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code + if overwrite_result is not None: + expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result + code = black.format_str(code, mode=black.FileMode([black.TargetVersion.PY35], line_length=119)) + fname = os.path.join(self.transformer_dir, "new_code.py") + with open(fname, "w", newline="\n") as f: + f.write(code) + if overwrite_result is None: + self.assertTrue(len(check_copies.is_copy_consistent(fname)) == 0) + else: + check_copies.is_copy_consistent(f.name, overwrite=True) + with open(fname, "r") as f: + self.assertEqual(f.read(), expected) + + def test_find_code_in_transformers(self): + code = check_copies.find_code_in_transformers("models.bert.modeling_bert.BertLMPredictionHead") + self.assertEqual(code, REFERENCE_CODE) + + def test_is_copy_consistent(self): + # Base copy consistency + self.check_copy_consistency( + "# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead", + "BertLMPredictionHead", + REFERENCE_CODE + "\n", + ) + + # With no empty line at the end + self.check_copy_consistency( + "# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead", + "BertLMPredictionHead", + REFERENCE_CODE, + ) + + # Copy consistency with rename + self.check_copy_consistency( + "# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->TestModel", + "TestModelLMPredictionHead", + re.sub("Bert", "TestModel", REFERENCE_CODE), + ) + + # Copy consistency with a really long name + long_class_name = "TestModelWithAReallyLongNameBecauseSomePeopleLikeThatForSomeReason" + self.check_copy_consistency( + f"# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->{long_class_name}", + f"{long_class_name}LMPredictionHead", + re.sub("Bert", long_class_name, REFERENCE_CODE), + ) + + # Copy consistency with overwrite + self.check_copy_consistency( + "# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->TestModel", + "TestModelLMPredictionHead", + REFERENCE_CODE, + overwrite_result=re.sub("Bert", "TestModel", REFERENCE_CODE), + ) diff --git a/test_versions_utils.py b/test_versions_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bd77218d69feb6743f6e2045f08dae4b92bd757 --- /dev/null +++ b/test_versions_utils.py @@ -0,0 +1,97 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +from transformers.testing_utils import TestCasePlus +from transformers.utils.versions import importlib_metadata, require_version, require_version_core + + +numpy_ver = importlib_metadata.version("numpy") +python_ver = ".".join([str(x) for x in sys.version_info[:3]]) + + +class DependencyVersionCheckTest(TestCasePlus): + def test_core(self): + # lt + different version strings + require_version_core("numpy<1000.4.5") + require_version_core("numpy<1000.4") + require_version_core("numpy<1000") + + # le + require_version_core("numpy<=1000.4.5") + require_version_core(f"numpy<={numpy_ver}") + + # eq + require_version_core(f"numpy=={numpy_ver}") + + # ne + require_version_core("numpy!=1000.4.5") + + # ge + require_version_core("numpy>=1.0") + require_version_core("numpy>=1.0.0") + require_version_core(f"numpy>={numpy_ver}") + + # gt + require_version_core("numpy>1.0.0") + + # mix + require_version_core("numpy>1.0.0,<1000") + + # requirement w/o version + require_version_core("numpy") + + # unmet requirements due to version conflict + for req in ["numpy==1.0.0", "numpy>=1000.0.0", f"numpy<{numpy_ver}"]: + try: + require_version_core(req) + except ImportError as e: + self.assertIn(f"{req} is required", str(e)) + self.assertIn("but found", str(e)) + + # unmet requirements due to missing module + for req in ["numpipypie>1", "numpipypie2"]: + try: + require_version_core(req) + except importlib_metadata.PackageNotFoundError as e: + self.assertIn(f"The '{req}' distribution was not found and is required by this application", str(e)) + self.assertIn("Try: pip install transformers -U", str(e)) + + # bogus requirements formats: + # 1. whole thing + for req in ["numpy??1.0.0", "numpy1.0.0"]: + try: + require_version_core(req) + except ValueError as e: + self.assertIn("requirement needs to be in the pip package format", str(e)) + # 2. only operators + for req in ["numpy=1.0.0", "numpy == 1.00", "numpy<>1.0.0", "numpy><1.00", "numpy>>1.0.0"]: + try: + require_version_core(req) + except ValueError as e: + self.assertIn("need one of ", str(e)) + + def test_python(self): + + # matching requirement + require_version("python>=3.6.0") + + # not matching requirements + for req in ["python>9.9.9", "python<3.0.0"]: + try: + require_version_core(req) + except ImportError as e: + self.assertIn(f"{req} is required", str(e)) + self.assertIn(f"but found python=={python_ver}", str(e))
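For reference, the two helpers exercised above can be used like this (a minimal sketch; the numpy pin and hint string are illustrative assumptions, not part of this diff):

    from transformers.utils.versions import require_version, require_version_core

    # Raises ImportError ("numpy>=1.17 is required ... but found numpy==x.y.z") when the pin
    # is unmet; the optional second argument is a hint appended to the error message.
    require_version("numpy>=1.17", "Try: pip install -U numpy")

    # Same check, but appends the hint "Try: pip install transformers -U",
    # which is exactly the string test_core asserts on above.
    require_version_core("numpy>=1.17")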