diff --git "a/efficient_training.ipynb" "b/efficient_training.ipynb"
new file mode 100644--- /dev/null
+++ "b/efficient_training.ipynb"
@@ -0,0 +1,4371 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Fine-tune a language model with dataset streaming and 8-bit optimizers",
+      "provenance": [],
+      "collapsed_sections": [],
+      "include_colab_link": true
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU",
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "372609dca95b4ddcb51491283df860f5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_e0b881dd26d54c7c92ba9ab5923fab10",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_90eb62f7ec634e098db511a5995d807f",
+              "IPY_MODEL_d60a799761f649378d17a044362e55b9",
+              "IPY_MODEL_a76cf6149c6748a5aa74ada58921d31e"
+            ]
+          }
+        },
+        "e0b881dd26d54c7c92ba9ab5923fab10": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "90eb62f7ec634e098db511a5995d807f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_5256dfd69e364597a29b2ad61c01ea93",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: ",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_1a231c5cffbb4225941d02cb5e3bb273"
+          }
+        },
+        "d60a799761f649378d17a044362e55b9": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_cffbeabee69446c48dfa89ac38d9f45e",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 1376,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 1376,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_0c02af4a252e40fba08c097f59926dc8"
+          }
+        },
+        "a76cf6149c6748a5aa74ada58921d31e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_2917b5df9cc14ec3a2fe356c12c8511e",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 3.29k/? [00:00&lt;00:00, 76.9kB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_5c3abd7a7f354ac0b1cd3d89506f417a"
+          }
+        },
+        "5256dfd69e364597a29b2ad61c01ea93": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "1a231c5cffbb4225941d02cb5e3bb273": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "cffbeabee69446c48dfa89ac38d9f45e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "0c02af4a252e40fba08c097f59926dc8": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "2917b5df9cc14ec3a2fe356c12c8511e": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "5c3abd7a7f354ac0b1cd3d89506f417a": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "8714bb6e944345b98b691f40adf0bf76": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_a2983efe78b94919891d6db909100934",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_b61fb9c3745f4c468954713b07f8f16f",
+              "IPY_MODEL_5f05edadc8c943aa82a790e815253a48",
+              "IPY_MODEL_867c7dd23eb64a1b8d02a7cf8a4ad64a"
+            ]
+          }
+        },
+        "a2983efe78b94919891d6db909100934": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "b61fb9c3745f4c468954713b07f8f16f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_01445fea254a436fa464b6006f3abd92",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: ",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_74c4d6d598bf4173999a56fa849506d3"
+          }
+        },
+        "5f05edadc8c943aa82a790e815253a48": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_e299d23c03444805b0a359eb049cddb3",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 492167,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 492167,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_c2f4080d692a46debfb5a8000d2637d6"
+          }
+        },
+        "867c7dd23eb64a1b8d02a7cf8a4ad64a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_b0cbc589d149494cbb74139bedb0aafc",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 2.40M/? [00:00&lt;00:00, 18.6MB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_b3fec901d0cc48a2862b900928b681f3"
+          }
+        },
+        "01445fea254a436fa464b6006f3abd92": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "74c4d6d598bf4173999a56fa849506d3": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "e299d23c03444805b0a359eb049cddb3": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "c2f4080d692a46debfb5a8000d2637d6": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "b0cbc589d149494cbb74139bedb0aafc": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "b3fec901d0cc48a2862b900928b681f3": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "619629ebf5fc4e3ba9ad49e7e767a37d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_87a5f7bca18e4a818d6d48e15bc68845",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_75d276bd972d429483af64d1eb27624c",
+              "IPY_MODEL_6f778a9c54f042199869eb16563bb933",
+              "IPY_MODEL_275f3cf6de1f49c1ae1f09991b4ea99c"
+            ]
+          }
+        },
+        "87a5f7bca18e4a818d6d48e15bc68845": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "75d276bd972d429483af64d1eb27624c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_670d440077b34a8e86bf2d620ab6fb6d",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: 100%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_ac7455b1e5e5475cabf4dab505603a43"
+          }
+        },
+        "6f778a9c54f042199869eb16563bb933": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_4e82c37304d446d88e852553cbe9acb9",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 666,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 666,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_213fc01fa39b4b77ba211a9304c5ea86"
+          }
+        },
+        "275f3cf6de1f49c1ae1f09991b4ea99c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_a9dae405fbce41e3a46b05292313bab0",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 666/666 [00:00&lt;00:00, 15.4kB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_272807a127d54f5ea3f20c0bc7262a25"
+          }
+        },
+        "670d440077b34a8e86bf2d620ab6fb6d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "ac7455b1e5e5475cabf4dab505603a43": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "4e82c37304d446d88e852553cbe9acb9": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "213fc01fa39b4b77ba211a9304c5ea86": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "a9dae405fbce41e3a46b05292313bab0": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "272807a127d54f5ea3f20c0bc7262a25": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "1ce665de582c4bd392d6bc4fff9a1499": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_36cdf4ebc5684e9e88d80d5c98d86154",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_288d1670c87e469ba3fe7c0030887d19",
+              "IPY_MODEL_eeb0bf36fb93410f82ab96b5414b6c50",
+              "IPY_MODEL_3c8d0050e0904acf933d18316000ac8a"
+            ]
+          }
+        },
+        "36cdf4ebc5684e9e88d80d5c98d86154": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "288d1670c87e469ba3fe7c0030887d19": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_3be14001505b4dd19293365cb4a52cbc",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: 100%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_4195ef4de4ba491995e28b72e4d82a5b"
+          }
+        },
+        "eeb0bf36fb93410f82ab96b5414b6c50": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_cafc6a76a9ac4de79b46969ebab90265",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 1042301,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 1042301,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_9e6e0d9075454e17932e147678c1cef6"
+          }
+        },
+        "3c8d0050e0904acf933d18316000ac8a": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_7c83e50b67374fa5a2d713c036ba8e84",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 0.99M/0.99M [00:00&lt;00:00, 1.42MB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_0e0d370b48c84349a7f7cc45ed5f9d09"
+          }
+        },
+        "3be14001505b4dd19293365cb4a52cbc": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "4195ef4de4ba491995e28b72e4d82a5b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "cafc6a76a9ac4de79b46969ebab90265": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "9e6e0d9075454e17932e147678c1cef6": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "7c83e50b67374fa5a2d713c036ba8e84": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "0e0d370b48c84349a7f7cc45ed5f9d09": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "2222296ad8d14cd39e8af17c108ad5ed": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_e39daeca11584766a42f9bc6df76089c",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_8fad1d6738b84704a3b4d248dbcfac2b",
+              "IPY_MODEL_405c42d0df19419e8563b00a3f402ae9",
+              "IPY_MODEL_7e2ceabdd41446cc860e35f94e02de31"
+            ]
+          }
+        },
+        "e39daeca11584766a42f9bc6df76089c": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "8fad1d6738b84704a3b4d248dbcfac2b": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_5b2f0da9535f46d78999e11162e16666",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: 100%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_d47fc8e1a9a84efcb6d9a02ba973e940"
+          }
+        },
+        "405c42d0df19419e8563b00a3f402ae9": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_a127db6cdeae4700834c1dc582ecb609",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 456318,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 456318,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_b9a0b5fe13134774a7560a583513c7cd"
+          }
+        },
+        "7e2ceabdd41446cc860e35f94e02de31": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_66d58d9dab054294a81601572b143c00",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 446k/446k [00:00&lt;00:00, 1.46MB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_75f479bbe938484891214f27986364eb"
+          }
+        },
+        "5b2f0da9535f46d78999e11162e16666": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "d47fc8e1a9a84efcb6d9a02ba973e940": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "a127db6cdeae4700834c1dc582ecb609": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "b9a0b5fe13134774a7560a583513c7cd": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "66d58d9dab054294a81601572b143c00": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "75f479bbe938484891214f27986364eb": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "05c7977a9322499bbc00e80f0d767bee": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_9a76a0ee943343d781caf5b30ce0c6a5",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_f454951d36c74e13bc3cf04b41388345",
+              "IPY_MODEL_1974ea03fa42428fa162eac65f8d71e4",
+              "IPY_MODEL_96cdea5bf8ba44f3a37e5e1f70cbef62"
+            ]
+          }
+        },
+        "9a76a0ee943343d781caf5b30ce0c6a5": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "f454951d36c74e13bc3cf04b41388345": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_221d95935ae249bb8fbf60007ed39e82",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: 100%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_0a5ca2e268c24ae1a9e42d5c9dcadb29"
+          }
+        },
+        "1974ea03fa42428fa162eac65f8d71e4": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_997827c35e02494d82209fa5f1232e5f",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 1355256,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 1355256,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_aaf3281319a94b42ba21941f41d262bc"
+          }
+        },
+        "96cdea5bf8ba44f3a37e5e1f70cbef62": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_ffa4cf74c0c84e0797de60e50e1f579c",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 1.29M/1.29M [00:00&lt;00:00, 5.20MB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_d60228cfbd9f44ddbcefb20a74d99779"
+          }
+        },
+        "221d95935ae249bb8fbf60007ed39e82": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "0a5ca2e268c24ae1a9e42d5c9dcadb29": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "997827c35e02494d82209fa5f1232e5f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "aaf3281319a94b42ba21941f41d262bc": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "ffa4cf74c0c84e0797de60e50e1f579c": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "d60228cfbd9f44ddbcefb20a74d99779": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "8bff97fa5703438685342e1b140234f5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_542ccbd2abf941fea5312b72c51ec5b6",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_f71eb8c60e4b4ef6b44b9f6b90bf25c5",
+              "IPY_MODEL_1355930a0edf40e5ac75198e9d901bfb",
+              "IPY_MODEL_ce047a309f984458859ddfc15aad1125"
+            ]
+          }
+        },
+        "542ccbd2abf941fea5312b72c51ec5b6": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "f71eb8c60e4b4ef6b44b9f6b90bf25c5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_c82b029db5a547f09d1b9607a5d26ecc",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "Downloading: 100%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_01312ccfa1254c7a8acc6a7410af711a"
+          }
+        },
+        "1355930a0edf40e5ac75198e9d901bfb": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_7db4a94408b94d71ab9f6fccded0f011",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "success",
+            "max": 3247202234,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 3247202234,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_356eaaf643894327a0ba30049c3a2b92"
+          }
+        },
+        "ce047a309f984458859ddfc15aad1125": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_874af3844342411096d6d8872354f531",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 3.02G/3.02G [02:35&lt;00:00, 10.9MB/s]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_fd8cb9272f3d4ff9ae673c724a6b757b"
+          }
+        },
+        "c82b029db5a547f09d1b9607a5d26ecc": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "01312ccfa1254c7a8acc6a7410af711a": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "7db4a94408b94d71ab9f6fccded0f011": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "356eaaf643894327a0ba30049c3a2b92": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "874af3844342411096d6d8872354f531": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "fd8cb9272f3d4ff9ae673c724a6b757b": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "6e2b38e0faf64f529e849f37ad4d4eab": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HBoxModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HBoxView",
+            "_dom_classes": [],
+            "_model_name": "HBoxModel",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "box_style": "",
+            "layout": "IPY_MODEL_ba6e9cadc3274a64ae0738fb99c55860",
+            "_model_module": "@jupyter-widgets/controls",
+            "children": [
+              "IPY_MODEL_62edc38a7caf4feca6db6de2d767fbb3",
+              "IPY_MODEL_155616f4494440b291ae4722aaeca750",
+              "IPY_MODEL_2045670c97194bd68a398213552d8364"
+            ]
+          }
+        },
+        "ba6e9cadc3274a64ae0738fb99c55860": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "62edc38a7caf4feca6db6de2d767fbb3": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_1ca831c613424bea8f42c4a89f44b2b5",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": "  0%",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_b352385cb0664a8b873f5028a728c4e3"
+          }
+        },
+        "155616f4494440b291ae4722aaeca750": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "FloatProgressModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "ProgressView",
+            "style": "IPY_MODEL_085e99b8131f4f8bb65f2ed723a21b48",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "FloatProgressModel",
+            "bar_style": "",
+            "max": 1000000,
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": 32,
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "orientation": "horizontal",
+            "min": 0,
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_d1ec06e6abca4b35b8e22856bcc9c5ce"
+          }
+        },
+        "2045670c97194bd68a398213552d8364": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "HTMLModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "HTMLView",
+            "style": "IPY_MODEL_565f3cec67374748bdc8baf19f18963f",
+            "_dom_classes": [],
+            "description": "",
+            "_model_name": "HTMLModel",
+            "placeholder": "​",
+            "_view_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "value": " 32/1000000 [03:13&lt;1383:55:47,  4.98s/it]",
+            "_view_count": null,
+            "_view_module_version": "1.5.0",
+            "description_tooltip": null,
+            "_model_module": "@jupyter-widgets/controls",
+            "layout": "IPY_MODEL_358cb048c5a04a7cb2e9de91017baf01"
+          }
+        },
+        "1ca831c613424bea8f42c4a89f44b2b5": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "b352385cb0664a8b873f5028a728c4e3": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "085e99b8131f4f8bb65f2ed723a21b48": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "ProgressStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "ProgressStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "bar_color": null,
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "d1ec06e6abca4b35b8e22856bcc9c5ce": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        },
+        "565f3cec67374748bdc8baf19f18963f": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_name": "DescriptionStyleModel",
+          "model_module_version": "1.5.0",
+          "state": {
+            "_view_name": "StyleView",
+            "_model_name": "DescriptionStyleModel",
+            "description_width": "",
+            "_view_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.5.0",
+            "_view_count": null,
+            "_view_module_version": "1.2.0",
+            "_model_module": "@jupyter-widgets/controls"
+          }
+        },
+        "358cb048c5a04a7cb2e9de91017baf01": {
+          "model_module": "@jupyter-widgets/base",
+          "model_name": "LayoutModel",
+          "model_module_version": "1.2.0",
+          "state": {
+            "_view_name": "LayoutView",
+            "grid_template_rows": null,
+            "right": null,
+            "justify_content": null,
+            "_view_module": "@jupyter-widgets/base",
+            "overflow": null,
+            "_model_module_version": "1.2.0",
+            "_view_count": null,
+            "flex_flow": null,
+            "width": null,
+            "min_width": null,
+            "border": null,
+            "align_items": null,
+            "bottom": null,
+            "_model_module": "@jupyter-widgets/base",
+            "top": null,
+            "grid_column": null,
+            "overflow_y": null,
+            "overflow_x": null,
+            "grid_auto_flow": null,
+            "grid_area": null,
+            "grid_template_columns": null,
+            "flex": null,
+            "_model_name": "LayoutModel",
+            "justify_items": null,
+            "grid_row": null,
+            "max_height": null,
+            "align_content": null,
+            "visibility": null,
+            "align_self": null,
+            "height": null,
+            "min_height": null,
+            "padding": null,
+            "grid_auto_rows": null,
+            "grid_gap": null,
+            "max_width": null,
+            "order": null,
+            "_view_module_version": "1.2.0",
+            "grid_template_areas": null,
+            "object_position": null,
+            "object_fit": null,
+            "grid_auto_columns": null,
+            "margin": null,
+            "display": null,
+            "left": null
+          }
+        }
+      }
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/gist/justheuristic/75f6a2a731f05a213a55cd2c8a458aaf/fine-tune-a-language-model-with-dataset-streaming-and-8-bit-optimizers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Kw6MQx4xFjgy"
+      },
+      "source": [
+        "__This notebook__ explains how to fine-tune GPT-2 Large on a large dataset in colab or on your home computer.\n",
+        "\n",
+        "To fit this task into a colab instance, we will use two tricks:\n",
+        "* streaming the [C4 dataset](https://huggingface.co/datasets/c4) using the [`datasets` Streaming API](https://huggingface.co/docs/datasets/dataset_streaming.html). Without that, C4 would take up over 300GB of disk space.\n",
+        "* training with 8-Bit Adam from the [`bitsandbytes` library](https://github.com/facebookresearch/bitsandbytes). Without 8-bit compression, training GPT-2 Large would not fit in GPU memory.\n",
+        "\n",
+        "\n",
+        "\n",
+        "This notebook is based on the [\"fine-tune a language model\"](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb) tutorial by [Sylvain Gugger](https://sgugger.github.io/pages/about-me.html#about-me) as well as the [pytorch language-model example](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py)."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-E5Z6CLC6UfJ"
+      },
+      "source": [
+        "# Installation"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "X4cRE8IbIrIV"
+      },
+      "source": [
+        "If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets. Run the following cell to install them."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "MOsHUjgdIrIW",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "4940dd79-5eed-4eaa-8ac4-55d6ee49a454"
+      },
+      "source": [
+        " ! pip install datasets transformers"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Collecting datasets\n",
+            "  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)\n",
+            "\u001b[K     |████████████████████████████████| 298 kB 5.3 MB/s \n",
+            "\u001b[?25hCollecting transformers\n",
+            "  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)\n",
+            "\u001b[K     |████████████████████████████████| 3.1 MB 34.1 MB/s \n",
+            "\u001b[?25hCollecting xxhash\n",
+            "  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n",
+            "\u001b[K     |████████████████████████████████| 243 kB 52.1 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.1.5)\n",
+            "Collecting fsspec[http]>=2021.05.0\n",
+            "  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)\n",
+            "\u001b[K     |████████████████████████████████| 132 kB 38.4 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n",
+            "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (3.0.0)\n",
+            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.8.2)\n",
+            "Collecting aiohttp\n",
+            "  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n",
+            "\u001b[K     |████████████████████████████████| 1.1 MB 36.3 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.19.5)\n",
+            "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.62.3)\n",
+            "Collecting huggingface-hub<1.0.0,>=0.1.0\n",
+            "  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n",
+            "\u001b[K     |████████████████████████████████| 61 kB 395 kB/s \n",
+            "\u001b[?25hRequirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n",
+            "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n",
+            "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n",
+            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.10.0.2)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.4.0)\n",
+            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.13)\n",
+            "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.6)\n",
+            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
+            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
+            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
+            "Collecting tokenizers<0.11,>=0.10.1\n",
+            "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
+            "\u001b[K     |████████████████████████████████| 3.3 MB 33.4 MB/s \n",
+            "\u001b[?25hCollecting pyyaml\n",
+            "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
+            "\u001b[K     |████████████████████████████████| 596 kB 35.6 MB/s \n",
+            "\u001b[?25hCollecting sacremoses\n",
+            "  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
+            "\u001b[K     |████████████████████████████████| 895 kB 42.3 MB/s \n",
+            "\u001b[?25hCollecting aiosignal>=1.1.2\n",
+            "  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n",
+            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.2.0)\n",
+            "Collecting multidict<7.0,>=4.5\n",
+            "  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)\n",
+            "\u001b[K     |████████████████████████████████| 160 kB 49.5 MB/s \n",
+            "\u001b[?25hCollecting frozenlist>=1.1.1\n",
+            "  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)\n",
+            "\u001b[K     |████████████████████████████████| 192 kB 52.2 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.8)\n",
+            "Collecting yarl<2.0,>=1.0\n",
+            "  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
+            "\u001b[K     |████████████████████████████████| 271 kB 55.7 MB/s \n",
+            "\u001b[?25hCollecting asynctest==0.13.0\n",
+            "  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n",
+            "Collecting async-timeout<5.0,>=4.0.0a3\n",
+            "  Downloading async_timeout-4.0.1-py3-none-any.whl (5.7 kB)\n",
+            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.6.0)\n",
+            "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n",
+            "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2018.9)\n",
+            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n",
+            "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, pyyaml, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, transformers, datasets\n",
+            "  Attempting uninstall: pyyaml\n",
+            "    Found existing installation: PyYAML 3.13\n",
+            "    Uninstalling PyYAML-3.13:\n",
+            "      Successfully uninstalled PyYAML-3.13\n",
+            "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.1 asynctest-0.13.0 datasets-1.16.1 frozenlist-1.2.0 fsspec-2021.11.1 huggingface-hub-0.2.1 multidict-5.2.0 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.5 xxhash-2.0.2 yarl-1.7.2\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Aewv5KT-JnHz"
+      },
+      "source": [
+        "\n",
+        "We are also installing bitsandbytes, which depends on the CUDA version used by your Colab runtime. The installed CUDA version is displayed in the top right of the `nvidia-smi` output. Use this version to pick the matching bitsandbytes package below (for example, `bitsandbytes-cuda112` for CUDA 11.2). We need a GPU for this, so if you do not have one attached yet, use Runtime -> Change runtime type and select GPU from the dropdown."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XprTmlZuJpXL",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "c7343f62-43ef-41f2-972d-6285793fecba"
+      },
+      "source": [
+        "! nvidia-smi\n",
+        "! pip install bitsandbytes-cuda112"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Sat Dec  4 19:08:21 2021       \n",
+            "+-----------------------------------------------------------------------------+\n",
+            "| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
+            "|-------------------------------+----------------------+----------------------+\n",
+            "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+            "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
+            "|                               |                      |               MIG M. |\n",
+            "|===============================+======================+======================|\n",
+            "|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |\n",
+            "| N/A   36C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |\n",
+            "|                               |                      |                  N/A |\n",
+            "+-------------------------------+----------------------+----------------------+\n",
+            "                                                                               \n",
+            "+-----------------------------------------------------------------------------+\n",
+            "| Processes:                                                                  |\n",
+            "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
+            "|        ID   ID                                                   Usage      |\n",
+            "|=============================================================================|\n",
+            "|  No running processes found                                                 |\n",
+            "+-----------------------------------------------------------------------------+\n",
+            "Collecting bitsandbytes-cuda112\n",
+            "  Downloading bitsandbytes_cuda112-0.26.0-py3-none-any.whl (4.2 MB)\n",
+            "\u001b[K     |████████████████████████████████| 4.2 MB 5.4 MB/s \n",
+            "\u001b[?25hInstalling collected packages: bitsandbytes-cuda112\n",
+            "Successfully installed bitsandbytes-cuda112-0.26.0\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZyC8zpEqGddx"
+      },
+      "source": [
+        "To test the bitsandbytes installation we can run a simple update with 8-bit Adam."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "A8HODqE2E_6N",
+        "outputId": "e1888813-c491-4d02-baa9-7e0e312d2a36"
+      },
+      "source": [
+        "import bitsandbytes as bnb\n",
+        "import torch\n",
+        "\n",
+        "p = torch.nn.Parameter(torch.rand(10,10).cuda())\n",
+        "a = torch.rand(10,10).cuda()\n",
+        "\n",
+        "p1 = p.data.sum().item()\n",
+        "\n",
+        "adam = bnb.optim.Adam8bit([p])\n",
+        "\n",
+        "out = a*p\n",
+        "loss = out.sum()\n",
+        "loss.backward()\n",
+        "adam.step()\n",
+        "\n",
+        "p2 = p.data.sum().item()\n",
+        "\n",
+        "assert p1 != p2\n",
+        "print('SUCCESS!')\n",
+        "print('Installation was successful!')"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "SUCCESS!\n",
+            "Installation was successful!\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "GDkUr_zQBoZR"
+      },
+      "source": [
+        "# Dataset Streaming"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "i4UPzShfBrgr"
+      },
+      "source": [
+        "Pre-training often requires a huge text dataset. Some famous datasets for pre-training are [C4](https://huggingface.co/datasets/c4), its multilingual version [mC4](https://huggingface.co/datasets/mc4), as well as [OSCAR](https://huggingface.co/datasets/oscar).\n",
+        "\n",
+        "These datasets can be terabytes of data and require a lot of resources:\n",
+        "- a good bandwidth to download all the data\n",
+        "- terabytes of disk space to store the data\n",
+        "- dozens of CPUs and a good infrastructure to tokenize the text dataset\n",
+        "- lots of time to wait for the tokenization to happen\n",
+        "\n",
+        "This makes it very impractical to get your hands on a dataset for pretraining when you have limited resources."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6xabIliEFxyo"
+      },
+      "source": [
+        "Dataset streaming is the solution in this case. Streaming lets you access only the very small subset of the dataset that you need at any given time during training. Text samples are progressively downloaded during training and processed on the fly.\n",
+        "\n",
+        "Thanks to dataset streaming:\n",
+        "- training can start directly without waiting for terabytes of data to be downloaded\n",
+        "- you can use an arbitrarily large dataset, without being constrained by your disk space\n",
+        "- you can process the batches of text as they arrive with a regular CPU\n",
+        "- you don't waste time processing examples that are not immediately needed for training"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9JIEFS8nISgn"
+      },
+      "source": [
+        "Dataset streaming is available in [Hugging Face Datasets](https://github.com/huggingface/datasets) - you simply need to pass `streaming=True` when loading a dataset, and it can be used with a PyTorch data loader.\n",
+        "\n",
+        "![dataset streaming](https://huggingface.co/docs/datasets/_images/stream.gif \"Dataset Streaming\")\n",
+        "\n",
+        "Hugging Face Datasets also allows you to process examples on the fly via `.map()` and shuffle the dataset with `.shuffle()`.\n",
+        "\n",
+        "Here is an example of how to load the C4 dataset in streaming mode:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 183,
+          "referenced_widgets": [
+            "372609dca95b4ddcb51491283df860f5",
+            "e0b881dd26d54c7c92ba9ab5923fab10",
+            "90eb62f7ec634e098db511a5995d807f",
+            "d60a799761f649378d17a044362e55b9",
+            "a76cf6149c6748a5aa74ada58921d31e",
+            "5256dfd69e364597a29b2ad61c01ea93",
+            "1a231c5cffbb4225941d02cb5e3bb273",
+            "cffbeabee69446c48dfa89ac38d9f45e",
+            "0c02af4a252e40fba08c097f59926dc8",
+            "2917b5df9cc14ec3a2fe356c12c8511e",
+            "5c3abd7a7f354ac0b1cd3d89506f417a",
+            "8714bb6e944345b98b691f40adf0bf76",
+            "a2983efe78b94919891d6db909100934",
+            "b61fb9c3745f4c468954713b07f8f16f",
+            "5f05edadc8c943aa82a790e815253a48",
+            "867c7dd23eb64a1b8d02a7cf8a4ad64a",
+            "01445fea254a436fa464b6006f3abd92",
+            "74c4d6d598bf4173999a56fa849506d3",
+            "e299d23c03444805b0a359eb049cddb3",
+            "c2f4080d692a46debfb5a8000d2637d6",
+            "b0cbc589d149494cbb74139bedb0aafc",
+            "b3fec901d0cc48a2862b900928b681f3"
+          ]
+        },
+        "id": "N4E5ZT6rJnm7",
+        "outputId": "f7c35c58-83c5-4cd4-8032-059c6da674a3"
+      },
+      "source": [
+        "from datasets import load_dataset\n",
+        "\n",
+        "c4 = load_dataset(\"c4\", \"en\", streaming=True)\n",
+        "\n",
+        "# Let's print a few examples\n",
+        "for i, example in enumerate(c4[\"train\"]):\n",
+        "    print(f\"{i}: {str(example)[:200]}...\")\n",
+        "    if i == 5:\n",
+        "        break\n"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "372609dca95b4ddcb51491283df860f5",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "8714bb6e944345b98b691f40adf0bf76",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/492k [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "0: {'text': 'Beginners BBQ Class Taking Place in Missoula!\\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join ...\n",
+            "1: {'text': 'Discussion in \\'Mac OS X Lion (10.7)\\' started by axboi87, Jan 20, 2012.\\nI\\'ve got a 500gb internal drive and a 240gb SSD.\\nWhen trying to restore using disk utility i\\'m given the error \"N...\n",
+            "2: {'text': 'Foil plaid lycra and spandex shortall with metallic slinky insets. Attached metallic elastic belt with O-ring. Headband included. Great hip hop or jazz dance costume. Made in the USA.', 'tim...\n",
+            "3: {'text': \"How many backlinks per day for new site?\\nDiscussion in 'Black Hat SEO' started by Omoplata, Dec 3, 2010.\\n1) for a newly created site, what's the max # backlinks per day I should do to be s...\n",
+            "4: {'text': 'The Denver Board of Education opened the 2017-18 school year with an update on projects that include new construction, upgrades, heat mitigation and quality learning environments.\\nWe are ex...\n",
+            "5: {'text': 'BANGALORE CY JUNCTION SBC to GONDIA JUNCTION G train timings, routes, stops, and complete info.\\nAs of now, 1 trains run between from BANGALORE CY JUNCTION (YPR) to GONDIA JUNCTION (G).\\nThe...\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "a3KD3WXU3l-O"
+      },
+      "source": [
+        "# Fine-tuning a language model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "qJzozNPFKc2x"
+      },
+      "source": [
+        "We are fine-tuning without Trainer so we can specify dataset streaming and 8-bit optimizers. For that, we copy the main parts of the [example script for language modeling](https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py). First we import the boilerplate and then go step by step through the model setup and training."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "np2mr2Le6oqy"
+      },
+      "source": [
+        "## Preprocessing & model setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ET8K4pK1Kq7O"
+      },
+      "source": [
+        "# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.\n",
+        "\n",
+        "import argparse\n",
+        "import logging\n",
+        "import math\n",
+        "import os\n",
+        "import random\n",
+        "from itertools import chain\n",
+        "from pathlib import Path\n",
+        "\n",
+        "import datasets\n",
+        "import torch\n",
+        "from datasets import load_dataset\n",
+        "from torch.utils.data import DataLoader\n",
+        "from tqdm.auto import tqdm\n",
+        "\n",
+        "import transformers\n",
+        "from huggingface_hub import Repository\n",
+        "from transformers import (\n",
+        "    CONFIG_MAPPING,\n",
+        "    MODEL_MAPPING,\n",
+        "    AdamW,\n",
+        "    AutoConfig,\n",
+        "    AutoModelForCausalLM,\n",
+        "    AutoTokenizer,\n",
+        "    SchedulerType,\n",
+        "    default_data_collator,\n",
+        "    get_scheduler,\n",
+        "    set_seed,\n",
+        ")\n",
+        "from transformers.file_utils import get_full_repo_name\n",
+        "from transformers.utils.versions import require_version\n",
+        "\n",
+        "\n",
+        "logger = logging.getLogger(__name__)\n",
+        "\n",
+        "require_version(\"datasets>=1.16.1\", \"To fix: pip install -r examples/pytorch/language-modeling/requirements.txt\")\n",
+        "\n",
+        "MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())\n",
+        "MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)\n",
+        "\n",
+        "\n",
+        "def parse_args(argv=None):\n",
+        "    \"\"\"Build the argument parser for causal LM fine-tuning and parse `argv`.\n",
+        "\n",
+        "    Args:\n",
+        "        argv: Optional list of argument strings. When omitted, an empty list is\n",
+        "            parsed, so every option takes its default and `sys.argv` is never read.\n",
+        "\n",
+        "    Returns:\n",
+        "        The populated `argparse.Namespace`.\n",
+        "\n",
+        "    Raises:\n",
+        "        AssertionError: if `--push_to_hub` is passed without `--output_dir`.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def _str_to_bool(value):\n",
+        "        \"\"\"Convert an explicit flag value such as `true` or `0` into a bool.\"\"\"\n",
+        "        lowered = value.lower()\n",
+        "        if lowered in (\"true\", \"t\", \"yes\", \"y\", \"1\"):\n",
+        "            return True\n",
+        "        if lowered in (\"false\", \"f\", \"no\", \"n\", \"0\", \"\"):\n",
+        "            return False\n",
+        "        raise argparse.ArgumentTypeError(f\"Expected a boolean value, got {value!r}.\")\n",
+        "\n",
+        "    parser = argparse.ArgumentParser(description=\"Finetune a transformers model on a causal language modeling task\")\n",
+        "    parser.add_argument(\n",
+        "        \"--dataset_name\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"The name of the dataset to use (via the datasets library).\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--dataset_config_name\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"The configuration name of the dataset to use (via the datasets library).\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--text_column_name\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"The name of the column containing the text data.\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--dataset_streaming\",\n",
+        "        action=\"store_true\",\n",
+        "        help=\"If passed, will use dataset streaming (via the datasets library)\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--model_name_or_path\",\n",
+        "        type=str,\n",
+        "        help=\"Path to pretrained model or model identifier from huggingface.co/models.\",\n",
+        "        required=False,\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--config_name\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"Pretrained config name or path if not the same as model_name\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--tokenizer_name\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"Pretrained tokenizer name or path if not the same as model_name\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--use_slow_tokenizer\",\n",
+        "        action=\"store_true\",\n",
+        "        help=\"If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--per_device_train_batch_size\",\n",
+        "        type=int,\n",
+        "        default=1,\n",
+        "        help=\"Batch size (per device) for the training dataloader.\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--learning_rate\",\n",
+        "        type=float,\n",
+        "        default=5e-5,\n",
+        "        help=\"Initial learning rate (after the potential warmup period) to use.\",\n",
+        "    )\n",
+        "    parser.add_argument(\"--weight_decay\", type=float, default=0.0, help=\"Weight decay to use.\")\n",
+        "    parser.add_argument(\"--num_train_epochs\", type=int, default=1, help=\"Total number of training epochs to perform.\")\n",
+        "    parser.add_argument(\n",
+        "        \"--max_train_steps\",\n",
+        "        type=int,\n",
+        "        default=None,\n",
+        "        help=\"Total number of training steps to perform. If provided, overrides num_train_epochs.\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--gradient_accumulation_steps\",\n",
+        "        type=int,\n",
+        "        default=1,\n",
+        "        help=\"Number of updates steps to accumulate before performing a backward/update pass.\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--lr_scheduler_type\",\n",
+        "        type=SchedulerType,\n",
+        "        default=\"linear\",\n",
+        "        help=\"The scheduler type to use.\",\n",
+        "        choices=[\"linear\", \"cosine\", \"cosine_with_restarts\", \"polynomial\", \"constant\", \"constant_with_warmup\"],\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--num_warmup_steps\", type=int, default=3000, help=\"Number of steps for the warmup in the lr scheduler.\"\n",
+        "    )\n",
+        "    parser.add_argument(\"--output_dir\", type=str, default=None, help=\"Where to store the final model.\")\n",
+        "    parser.add_argument(\"--seed\", type=int, default=None, help=\"A seed for reproducible training.\")\n",
+        "    parser.add_argument(\n",
+        "        \"--model_type\",\n",
+        "        type=str,\n",
+        "        default=None,\n",
+        "        help=\"Model type to use if training from scratch.\",\n",
+        "        choices=MODEL_TYPES,\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--block_size\",\n",
+        "        type=int,\n",
+        "        default=None,\n",
+        "        help=\"Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. Default to the model max input length for single sentence inputs (take into account special tokens).\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--preprocessing_num_workers\",\n",
+        "        type=int,\n",
+        "        default=None,\n",
+        "        help=\"The number of processes to use for the preprocessing.\",\n",
+        "    )\n",
+        "    # `type=bool` would turn any non-empty string (including \"False\") into True,\n",
+        "    # so the value is parsed explicitly; the bare flag still means True.\n",
+        "    parser.add_argument(\n",
+        "        \"--overwrite_cache\",\n",
+        "        type=_str_to_bool,\n",
+        "        nargs=\"?\",\n",
+        "        const=True,\n",
+        "        default=False,\n",
+        "        help=\"Overwrite the cached training and evaluation sets\",\n",
+        "    )\n",
+        "    parser.add_argument(\n",
+        "        \"--no_keep_linebreaks\", action=\"store_true\", help=\"Do not keep line breaks when using TXT files.\"\n",
+        "    )\n",
+        "    parser.add_argument(\"--push_to_hub\", action=\"store_true\", help=\"Whether or not to push the model to the Hub.\")\n",
+        "    parser.add_argument(\n",
+        "        \"--hub_model_id\", type=str, help=\"The name of the repository to keep in sync with the local `output_dir`.\"\n",
+        "    )\n",
+        "    parser.add_argument(\"--hub_token\", type=str, help=\"The token to use to push to the Model Hub.\")\n",
+        "\n",
+        "    # Default to an empty argument list so that calling this from a notebook\n",
+        "    # never parses the host process's own sys.argv.\n",
+        "    args = parser.parse_args(args=[] if argv is None else argv)\n",
+        "\n",
+        "    if args.push_to_hub:\n",
+        "        assert args.output_dir is not None, \"Need an `output_dir` to create a repo when `--push_to_hub` is passed.\"\n",
+        "\n",
+        "    return args\n"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4QqgoTBcLzqq"
+      },
+      "source": [
+        "We set up the streaming dataset, the tokenizer, and the model (GPT-2 Large)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1000,
+          "referenced_widgets": [
+            "619629ebf5fc4e3ba9ad49e7e767a37d",
+            "87a5f7bca18e4a818d6d48e15bc68845",
+            "75d276bd972d429483af64d1eb27624c",
+            "6f778a9c54f042199869eb16563bb933",
+            "275f3cf6de1f49c1ae1f09991b4ea99c",
+            "670d440077b34a8e86bf2d620ab6fb6d",
+            "ac7455b1e5e5475cabf4dab505603a43",
+            "4e82c37304d446d88e852553cbe9acb9",
+            "213fc01fa39b4b77ba211a9304c5ea86",
+            "a9dae405fbce41e3a46b05292313bab0",
+            "272807a127d54f5ea3f20c0bc7262a25",
+            "1ce665de582c4bd392d6bc4fff9a1499",
+            "36cdf4ebc5684e9e88d80d5c98d86154",
+            "288d1670c87e469ba3fe7c0030887d19",
+            "eeb0bf36fb93410f82ab96b5414b6c50",
+            "3c8d0050e0904acf933d18316000ac8a",
+            "3be14001505b4dd19293365cb4a52cbc",
+            "4195ef4de4ba491995e28b72e4d82a5b",
+            "cafc6a76a9ac4de79b46969ebab90265",
+            "9e6e0d9075454e17932e147678c1cef6",
+            "7c83e50b67374fa5a2d713c036ba8e84",
+            "0e0d370b48c84349a7f7cc45ed5f9d09",
+            "2222296ad8d14cd39e8af17c108ad5ed",
+            "e39daeca11584766a42f9bc6df76089c",
+            "8fad1d6738b84704a3b4d248dbcfac2b",
+            "405c42d0df19419e8563b00a3f402ae9",
+            "7e2ceabdd41446cc860e35f94e02de31",
+            "5b2f0da9535f46d78999e11162e16666",
+            "d47fc8e1a9a84efcb6d9a02ba973e940",
+            "a127db6cdeae4700834c1dc582ecb609",
+            "b9a0b5fe13134774a7560a583513c7cd",
+            "66d58d9dab054294a81601572b143c00",
+            "75f479bbe938484891214f27986364eb",
+            "05c7977a9322499bbc00e80f0d767bee",
+            "9a76a0ee943343d781caf5b30ce0c6a5",
+            "f454951d36c74e13bc3cf04b41388345",
+            "1974ea03fa42428fa162eac65f8d71e4",
+            "96cdea5bf8ba44f3a37e5e1f70cbef62",
+            "221d95935ae249bb8fbf60007ed39e82",
+            "0a5ca2e268c24ae1a9e42d5c9dcadb29",
+            "997827c35e02494d82209fa5f1232e5f",
+            "aaf3281319a94b42ba21941f41d262bc",
+            "ffa4cf74c0c84e0797de60e50e1f579c",
+            "d60228cfbd9f44ddbcefb20a74d99779",
+            "8bff97fa5703438685342e1b140234f5",
+            "542ccbd2abf941fea5312b72c51ec5b6",
+            "f71eb8c60e4b4ef6b44b9f6b90bf25c5",
+            "1355930a0edf40e5ac75198e9d901bfb",
+            "ce047a309f984458859ddfc15aad1125",
+            "c82b029db5a547f09d1b9607a5d26ecc",
+            "01312ccfa1254c7a8acc6a7410af711a",
+            "7db4a94408b94d71ab9f6fccded0f011",
+            "356eaaf643894327a0ba30049c3a2b92",
+            "874af3844342411096d6d8872354f531",
+            "fd8cb9272f3d4ff9ae673c724a6b757b"
+          ]
+        },
+        "id": "PSzJDsewLPEi",
+        "outputId": "f956fc65-1210-4db5-a2ea-fc482f357eb2"
+      },
+      "source": [
+        "# Start from the parser defaults, then override what this experiment needs.\n",
+        "args = parse_args()\n",
+        "\n",
+        "# Seed the run only when a seed was provided.\n",
+        "if args.seed is not None:\n",
+        "    set_seed(args.seed)\n",
+        "\n",
+        "# Experiment settings, written straight onto the argparse namespace.\n",
+        "vars(args).update(\n",
+        "    dataset_name=\"c4\",\n",
+        "    dataset_streaming=True,\n",
+        "    text_column_name=\"text\",\n",
+        "    model_name_or_path=\"gpt2-large\",\n",
+        "    dataset_config_name=\"en\",\n",
+        "    block_size=1024,\n",
+        "    max_train_steps=1_000_000,\n",
+        "    log_loss_interval=25,\n",
+        ")\n",
+        "\n",
+        "# DATASET\n",
+        "raw_train_dataset = load_dataset(\n",
+        "    args.dataset_name,\n",
+        "    args.dataset_config_name,\n",
+        "    streaming=args.dataset_streaming,\n",
+        "    split=\"train\",\n",
+        ")\n",
+        "\n",
+        "# CONFIG: an explicit config name wins over the model checkpoint name.\n",
+        "if args.config_name or args.model_name_or_path:\n",
+        "    config = AutoConfig.from_pretrained(args.config_name or args.model_name_or_path)\n",
+        "else:\n",
+        "    config = CONFIG_MAPPING[args.model_type]()\n",
+        "    logger.warning(\"You are instantiating a new config instance from scratch.\")\n",
+        "\n",
+        "# TOKENIZER: an explicit tokenizer name wins over the model checkpoint name.\n",
+        "if args.tokenizer_name or args.model_name_or_path:\n",
+        "    tokenizer = AutoTokenizer.from_pretrained(\n",
+        "        args.tokenizer_name or args.model_name_or_path,\n",
+        "        use_fast=not args.use_slow_tokenizer,\n",
+        "    )\n",
+        "else:\n",
+        "    raise ValueError(\n",
+        "        \"You are instantiating a new tokenizer from scratch. This is not supported by this script.\"\n",
+        "        \"You can do it from another script, save it, and load it from here, using --tokenizer_name.\"\n",
+        "    )\n",
+        "\n",
+        "# MODEL: load pretrained weights when a checkpoint is named, else build from the config.\n",
+        "if not args.model_name_or_path:\n",
+        "    logger.info(\"Training new model from scratch\")\n",
+        "    model = AutoModelForCausalLM.from_config(config)\n",
+        "else:\n",
+        "    model = AutoModelForCausalLM.from_pretrained(\n",
+        "        args.model_name_or_path,\n",
+        "        from_tf=\".ckpt\" in args.model_name_or_path,\n",
+        "        config=config,\n",
+        "    )\n",
+        "\n",
+        "# Match the embedding matrix to the tokenizer's vocabulary size.\n",
+        "model.resize_token_embeddings(len(tokenizer))\n",
+        "\n",
+        "model.gradient_checkpointing_enable()\n",
+        "\n",
+        "# Move the model to the GPU right away to free host RAM (the GPU doubles as offload memory here).\n",
+        "model.cuda()"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "619629ebf5fc4e3ba9ad49e7e767a37d",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "1ce665de582c4bd392d6bc4fff9a1499",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "2222296ad8d14cd39e8af17c108ad5ed",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "05c7977a9322499bbc00e80f0d767bee",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "8bff97fa5703438685342e1b140234f5",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "Downloading:   0%|          | 0.00/3.02G [00:00<?, ?B/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "GPT2LMHeadModel(\n",
+              "  (transformer): GPT2Model(\n",
+              "    (wte): Embedding(50257, 1280)\n",
+              "    (wpe): Embedding(1024, 1280)\n",
+              "    (drop): Dropout(p=0.1, inplace=False)\n",
+              "    (h): ModuleList(\n",
+              "      (0): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (1): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (2): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (3): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (4): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (5): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (6): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (7): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (8): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (9): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (10): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (11): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (12): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (13): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (14): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (15): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (16): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (17): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (18): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (19): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (20): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (21): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (22): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (23): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (24): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (25): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (26): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (27): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (28): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (29): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (30): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (31): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (32): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (33): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (34): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "      (35): GPT2Block(\n",
+              "        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (attn): GPT2Attention(\n",
+              "          (c_attn): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+              "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "        (mlp): GPT2MLP(\n",
+              "          (c_fc): Conv1D()\n",
+              "          (c_proj): Conv1D()\n",
+              "          (dropout): Dropout(p=0.1, inplace=False)\n",
+              "        )\n",
+              "      )\n",
+              "    )\n",
+              "    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+              "  )\n",
+              "  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)\n",
+              ")"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 6
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "B2zzrdxL-_-3"
+      },
+      "source": [
+        "We now preprocess the dataset into blocks of the specified size (768)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ThKeDte__C3A"
+      },
+      "source": [
+        "# Name of the dataset column that holds the raw text.\n",
+        "text_column_name = args.text_column_name\n",
+        "\n",
+        "def tokenize_function(examples):\n",
+        "    \"\"\"Run the notebook's `tokenizer` over the text column of `examples` and return its output.\"\"\"\n",
+        "    texts = examples[text_column_name]\n",
+        "    return tokenizer(texts)\n",
+        "\n",
+        "# Main data processing function: concatenates all texts of a batch and cuts the result into chunks of block_size.\n",
+        "def group_texts(examples):\n",
+        "    \"\"\"Concatenate every column of `examples` and re-split it into chunks of `block_size` items.\n",
+        "\n",
+        "    `block_size` is read from the notebook's global scope at call time. The usable length is taken from\n",
+        "    the first column. Once at least `block_size` items are available, the trailing remainder is dropped;\n",
+        "    otherwise the (shorter) content is kept as a single chunk. `labels` is a copy of `input_ids`.\n",
+        "    \"\"\"\n",
+        "    column_names = list(examples.keys())\n",
+        "    # Flatten each column (a list of lists) into one long list.\n",
+        "    flattened = {name: list(chain(*examples[name])) for name in column_names}\n",
+        "    usable_length = len(flattened[column_names[0]])\n",
+        "    # Drop the small remainder; padding could be used instead if the model supported it, so adapt this\n",
+        "    # part to your needs.\n",
+        "    if usable_length >= block_size:\n",
+        "        usable_length -= usable_length % block_size\n",
+        "    # Start offsets of the chunks, shared by every column.\n",
+        "    chunk_starts = range(0, usable_length, block_size)\n",
+        "    result = {}\n",
+        "    for name, items in flattened.items():\n",
+        "        result[name] = [items[start : start + block_size] for start in chunk_starts]\n",
+        "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
+        "    return result\n",
+        "\n",
+        "tokenized_train_dataset = raw_train_dataset.shuffle(10_000, seed=42).map(tokenize_function, batched=True)\n",
+        "\n",
+        "# Choose the length of the training blocks. Without an explicit block_size the tokenizer's maximum length is\n",
+        "# used, capped at 1024; an explicit value is clamped to what the tokenizer supports.\n",
+        "if args.block_size is None:\n",
+        "    block_size = tokenizer.model_max_length\n",
+        "    if block_size > 1024:\n",
+        "        logger.warning(\n",
+        "            f\"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). \"\n",
+        "            \"Picking 1024 instead. You can change that default value by passing --block_size xxx.\"\n",
+        "        )\n",
+        "        # Cap only inside this branch: a tokenizer limit below 1024 must not be raised to 1024.\n",
+        "        block_size = 1024\n",
+        "else:\n",
+        "    if args.block_size > tokenizer.model_max_length:\n",
+        "        logger.warning(\n",
+        "            f\"The block_size passed ({args.block_size}) is larger than the maximum length for the model\"\n",
+        "            f\"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}.\"\n",
+        "        )\n",
+        "    block_size = min(args.block_size, tokenizer.model_max_length)\n",
+        "\n",
+        "# With `batched=True`, map hands group_texts 1,000 texts at a time, so a remainder is thrown away for each\n",
+        "# of those groups. The batch_size of map can be adjusted, but a higher value might be slower to preprocess.\n",
+        "train_dataset = (\n",
+        "    tokenized_train_dataset.map(group_texts, batched=True)\n",
+        "    .shuffle(10_000, seed=42)\n",
+        "    .with_format(\"torch\")\n",
+        ")\n",
+        "\n",
+        "# Build the training DataLoader.\n",
+        "train_dataloader = DataLoader(\n",
+        "    train_dataset,\n",
+        "    batch_size=args.per_device_train_batch_size,\n",
+        "    collate_fn=default_data_collator,\n",
+        ")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "E0Wl7_Vx8HiP"
+      },
+      "source": [
+        "## 8-bit Optimizers"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "l3Z-4DmQ_etm"
+      },
+      "source": [
+        "\n",
+        "In this example, we fine-tune GPT-2 medium with a sequence length of 768, which runs out of memory when a regular 32-bit Adam optimizer is used. How can we fit this model on a Colab GPU with 12 GB of memory? One solution is to use 8-bit optimizers.\n",
+        "\n",
+        "8-bit optimizers decrease the memory footprint of training by storing the optimizer statistics in a compressed 8-bit form. For Adam, there are two optimizer buffers, one for an estimate of the running mean of the gradient and one for the standard deviation. Each of the buffers has the size of the full model; as such, the Adam optimizer uses 2x as much memory as the model itself. With 8-bit optimizers we reduce the precision of these buffers from 32-bit to 8-bit, thus reducing the memory due to Adam from 2x the model size to 0.5x the model size -- a reduction by 75%.\n",
+        "\n",
+        "8-bit optimizers work by using dynamic quantization and block-wise quantization to ensure stable training and the same performance as 32-bit optimizers while achieving the 75% reduction in memory.\n",
+        "\n",
+        "8-bit optimizers work as follows:\n",
+        "1. Chunk optimizer states into blocks\n",
+        "2. Normalize each block into the range [-1, 1] by dividing by the absmax of the block\n",
+        "3. Perform dynamic quantization\n",
+        "4. Store 8-bit data\n",
+        "\n",
+        "For dequantization we reverse these steps. These steps are demonstrated by the example below:\n",
+        "\n",
+        " ![Schematic of 8-bit optimizers](https://timdettmers.com/wp-content/uploads/2021/12/schematic2.svg)\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "tpoSBTsR_n7n"
+      },
+      "source": [
+        "import bitsandbytes as bnb\n",
+        "\n",
+        "# The regular 32-bit optimizer, torch.optim.Adam(model.parameters(), lr=args.learning_rate), crashes here\n",
+        "# with an out-of-memory error, so the 8-bit Adam from bitsandbytes is used instead.\n",
+        "optimizer = bnb.optim.Adam8bit(\n",
+        "    model.parameters(),\n",
+        "    lr=args.learning_rate,\n",
+        ")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "q0MHy1x07_Ac"
+      },
+      "source": [
+        "## Train the model"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4pOO1F7b_-f9"
+      },
+      "source": [
+        "Now we are training the model with dataset streaming and 8-bit optimizers."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 909,
+          "referenced_widgets": [
+            "6e2b38e0faf64f529e849f37ad4d4eab",
+            "ba6e9cadc3274a64ae0738fb99c55860",
+            "62edc38a7caf4feca6db6de2d767fbb3",
+            "155616f4494440b291ae4722aaeca750",
+            "2045670c97194bd68a398213552d8364",
+            "1ca831c613424bea8f42c4a89f44b2b5",
+            "b352385cb0664a8b873f5028a728c4e3",
+            "085e99b8131f4f8bb65f2ed723a21b48",
+            "d1ec06e6abca4b35b8e22856bcc9c5ce",
+            "565f3cec67374748bdc8baf19f18963f",
+            "358cb048c5a04a7cb2e9de91017baf01"
+          ]
+        },
+        "id": "ONzFs98p__5M",
+        "outputId": "ac4fb040-a3f3-41aa-cf8d-9699e621faad"
+      },
+      "source": [
+        "# Number of optimizer updates performed so far.\n",
+        "completed_steps = 0\n",
+        "# Samples consumed per optimizer update (micro-batch size times accumulation steps).\n",
+        "total_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps\n",
+        "\n",
+        "# Learning-rate schedule attached to the optimizer.\n",
+        "lr_scheduler = get_scheduler(\n",
+        "    name=args.lr_scheduler_type,\n",
+        "    optimizer=optimizer,\n",
+        "    num_warmup_steps=args.num_warmup_steps,\n",
+        "    num_training_steps=args.max_train_steps,\n",
+        ")\n",
+        "\n",
+        "# Train!\n",
+        "# The progress bar spans max_train_steps entries.\n",
+        "progress_bar = tqdm(range(args.max_train_steps), disable=False)\n",
+        "\n",
+        "def get_free_mem():\n",
+        "    \"\"\"Report the memory usage of CUDA device 0 as seen by the PyTorch allocator.\n",
+        "\n",
+        "    Returns:\n",
+        "        A tuple `(free, reserved, allocated)` in GiB, where `free` is `reserved - allocated`, i.e. memory\n",
+        "        PyTorch has reserved but is not currently using for tensors (not the free memory of the device).\n",
+        "    \"\"\"\n",
+        "    gib = 1024**3\n",
+        "    reserved = torch.cuda.memory_reserved(0)\n",
+        "    allocated = torch.cuda.memory_allocated(0)\n",
+        "    free_in_reserved = reserved - allocated\n",
+        "    return free_in_reserved / gib, reserved / gib, allocated / gib\n",
+        "\n",
+        "for epoch in range(args.num_train_epochs):\n",
+        "    model.train()\n",
+        "    # Losses of the micro-batches seen since the last perplexity report.\n",
+        "    losses = []\n",
+        "    for step, batch in enumerate(train_dataloader):\n",
+        "        # Move every tensor of the batch to the GPU.\n",
+        "        gpu_data = {key: value.cuda() for key, value in batch.items()}\n",
+        "\n",
+        "        outputs = model(**gpu_data, use_cache=False)\n",
+        "        loss = outputs.loss\n",
+        "        losses.append(loss.item())\n",
+        "        # Scale the loss so that the accumulated gradient is the mean over the accumulation window.\n",
+        "        loss = loss / args.gradient_accumulation_steps\n",
+        "        loss.backward()\n",
+        "        # `step` is 0-based: only update once a full window of micro-batches has been accumulated.\n",
+        "        if (step + 1) % args.gradient_accumulation_steps == 0:\n",
+        "            optimizer.step()\n",
+        "            lr_scheduler.step()\n",
+        "            optimizer.zero_grad()\n",
+        "            progress_bar.update(1)\n",
+        "            completed_steps += 1\n",
+        "\n",
+        "        if step % args.log_loss_interval == 0 and step > 0:\n",
+        "            # Perplexity is the exponential of the mean loss since the previous report.\n",
+        "            try:\n",
+        "                perplexity = math.exp(sum(losses) / len(losses))\n",
+        "            except OverflowError:\n",
+        "                perplexity = float(\"inf\")\n",
+        "            losses = []\n",
+        "            print(f\"epoch: {epoch+1}, step: {step}, perplexity: {perplexity}\")\n",
+        "\n",
+        "        if completed_steps >= args.max_train_steps:\n",
+        "            break\n",
+        "\n",
+        "    # The `break` above only leaves the current epoch; stop the remaining epochs as well.\n",
+        "    if completed_steps >= args.max_train_steps:\n",
+        "        break\n",
+        "\n"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "6e2b38e0faf64f529e849f37ad4d4eab",
+              "version_minor": 0,
+              "version_major": 2
+            },
+            "text/plain": [
+              "  0%|          | 0/1000000 [00:00<?, ?it/s]"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Token indices sequence length is longer than the specified maximum sequence length for this model (11128 > 1024). Running this sequence through the model will result in indexing errors\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "epoch: 1, step: 25, perplexity: 21.84923336079448\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+          ]
+        }
+      ]
+    }
+  ]
+}