Vivek committed
Commit 9c9f971
1 Parent(s): 94f6551

deleting files

Files changed (2):
  1. GPT2(error).ipynb +0 -1074
  2. Untitled330.ipynb +0 -470
GPT2(error).ipynb DELETED
@@ -1,1074 +0,0 @@
Notebook metadata: nbformat 4, nbformat_minor 0; Colab notebook "GPT2(error).ipynb" (provenance: [], collapsed_sections: []); kernelspec: python3 ("Python 3"); language_info: python.
Widget metadata: the "widgets" block stores serialized ipywidgets state (HBoxModel, FloatProgressModel and HTMLModel entries with their LayoutModel/ProgressStyleModel/DescriptionStyleModel children) for the three "Downloading: 100%" tokenizer progress bars shown under cell [2]; it contains display state only, no code.

Cell [1]:
%%capture
!pip install transformers
!pip install datasets
!pip install --upgrade git+https://github.com/google/flax.git

Cell [2]:
import jax
from transformers.modeling_flax_utils import FlaxPreTrainedModel
import flax.linen as nn
import jax.numpy as jnp
from transformers import GPT2Config
from transformers import FlaxGPT2PreTrainedModel
from transformers import FlaxGPT2Model
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token='<|endoftext|>')

Output: three "Downloading: 100%" progress bars (1.04M/1.04M [00:06<00:00, 154kB/s], 456k/456k [00:04<00:00, 96.1kB/s], 1.36M/1.36M [00:00<00:00, 1.73MB/s]).

Cell [3]:
#inputs = tokenizer(["JAX/Flax is amazing ","tensorflow is also good"],["pytorch is better","keras is the best"],return_tensors='jax',padding='max_length',max_length=30)

Cell [7]:
class FlaxGPT2ForMultipleChoiceModule(nn.Module):
    config: GPT2Config
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.gpt2 = FlaxGPT2Model(config=self.config, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=0.2)
        self.classifier = nn.Dense(4, dtype=self.dtype)

    def __call__(self, input_ids, attention_mask, position_ids, return_dict=True, deterministic=True, *args):
        batch_size = input_ids.shape[0]

        rng = jax.random.PRNGKey(0)
        _, dropout_rng = jax.random.split(rng)

        outputs = self.gpt2(input_ids, attention_mask, position_ids, return_dict=return_dict)

        hidden_states = outputs[0]
        hidden_states = jnp.mean(hidden_states, axis=1)
        print(hidden_states.shape)

        hidden_states = hidden_states.reshape(batch_size, -1)  # (32,8,768)->(32,8*768)

        dropout_output = self.dropout(hidden_states, deterministic=deterministic, rng=dropout_rng)
        print(dropout_output.shape)

        logits = self.classifier(dropout_output)
        reshaped_logits = logits.reshape(-1, 4)  # (32,4)
        if not return_dict:
            return (reshaped_logits,) + outputs[2:]
        return reshaped_logits

Cell [8]:
class FlaxGPT2ForMultipleChoice(FlaxGPT2PreTrainedModel):
    module_class = FlaxGPT2ForMultipleChoiceModule

Cell [9]:
model = FlaxGPT2ForMultipleChoice.from_pretrained('gpt2')  # getting warning

Output (stdout):
(1, 768)
(1, 768)

Output (stderr):
Some weights of the model checkpoint at gpt2 were not used when initializing FlaxGPT2ForMultipleChoice: {('h', '1', 'ln_1', 'bias'), ('h', '6', 'ln_1', 'scale'), ('h', '1', 'attn', 'c_proj', 'kernel'), ('h', '11', 'mlp', 'c_fc', 'bias'), ('h', '7', 'ln_1', 'bias'), ... , ('wte', 'embedding'), ('wpe', 'embedding'), ('ln_f', 'scale'), ('ln_f', 'bias')} (the elided set covers every pretrained GPT-2 parameter: the attention, MLP and layer-norm weights of blocks h.0 through h.11 plus the token and position embeddings and the final layer norm)
- This IS expected if you are initializing FlaxGPT2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaxGPT2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of FlaxGPT2ForMultipleChoice were not initialized from the model checkpoint at gpt2 and are newly initialized: {('classifier', 'bias'), ('classifier', 'kernel')}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Cell [10]:
input_ids = jnp.ones((4, 5, 6))
attention_mask = jnp.ones((4, 5, 6))

Cell [11]:
out1 = model(input_ids, attention_mask)  # GPT2 will not take (batch_size,num_choice,sequence_length)

Output (error):
ValueError                                Traceback (most recent call last)
<ipython-input-11-7491141e6756> in <module>()
----> 1 out1 = model(input_ids, attention_mask) #GPT2 will not take (batch_size,num_choice,sequence_length)

/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_flax_gpt2.py in __call__(self, input_ids, attention_mask, position_ids, params, past_key_values, dropout_rng, train, output_attentions, output_hidden_states, return_dict)
    370         return_dict = return_dict if return_dict is not None else self.config.return_dict
    371
--> 372         batch_size, sequence_length = input_ids.shape
    373
    374         if position_ids is None:

ValueError: too many values to unpack (expected 2)

Cell [ ]:
print(out1)

Output:
[[ 1.1391759 -0.01598702 0.55463445 0.36025363]
 [ 0.32208228 0.37667227 0.87823874 0.19541818]
 [ 0.76971424 0.7187787 0.68642044 -0.31461257]
 [ 1.2375658 0.03325981 0.00153449 0.12019679]]

Cell [ ]: (empty)
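Note on the failure above: the comment on the failing cell is accurate for the stock model; the GPT-2 Flax `__call__` in modeling_flax_gpt2.py unpacks `batch_size, sequence_length = input_ids.shape`, so it only accepts 2-D (batch_size, sequence_length) input, and a (batch_size, num_choices, sequence_length) multiple-choice batch raises exactly the ValueError shown. A minimal sketch of the usual workaround, folding the choice axis into the batch axis before the GPT-2 call and regrouping the scores afterwards (hypothetical helper, not part of the deleted notebook):

import jax.numpy as jnp

def flatten_choices(input_ids, attention_mask):
    # (batch, num_choices, seq_len) -> (batch * num_choices, seq_len),
    # the 2-D layout the Flax GPT-2 forward pass expects
    batch_size, num_choices, seq_len = input_ids.shape
    flat_ids = input_ids.reshape(batch_size * num_choices, seq_len)
    flat_mask = attention_mask.reshape(batch_size * num_choices, seq_len)
    return flat_ids, flat_mask, (batch_size, num_choices)

# after the model call, per-choice scores would be regrouped per example:
#   logits.reshape(batch_size, num_choices)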
Untitled330.ipynb DELETED
@@ -1,470 +0,0 @@
1
- {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "name": "Untitled330.ipynb",
7
- "provenance": [],
8
- "collapsed_sections": []
9
- },
10
- "kernelspec": {
11
- "name": "python3",
12
- "display_name": "Python 3"
13
- },
14
- "language_info": {
15
- "name": "python"
16
- }
17
- },
18
- "cells": [
19
- {
20
- "cell_type": "code",
21
- "metadata": {
22
- "id": "Ii2x731Ta8fu"
23
- },
24
- "source": [
25
- "%%capture\n",
26
- "!pip install transformers\n",
27
- "!pip install datasets\n",
28
- "!pip install --upgrade git+https://github.com/google/flax.git"
29
- ],
30
- "execution_count": 1,
31
- "outputs": []
32
- },
33
- {
34
- "cell_type": "code",
35
- "metadata": {
36
- "id": "_9NMPFKua9hr"
37
- },
38
- "source": [
39
- "import jax\n",
40
- "from transformers.modeling_flax_utils import FlaxPreTrainedModel\n",
41
- "import flax.linen as nn\n",
42
- "import jax.numpy as jnp\n",
43
- "from transformers import GPT2Config\n",
44
- "#from transformers import FlaxGPT2PreTrainedModel\n",
45
- "from transformers import FlaxGPT2Model\n",
46
- "import jax.numpy as jnp\n",
47
- "from transformers import GPT2Tokenizer\n",
48
- "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\",pad_token='<|endoftext|>') \n",
49
- "from typing import Any, Optional, Tuple\n",
50
- "from flax.core.frozen_dict import FrozenDict, unfreeze\n",
51
- "from transformers import file_utils\n",
52
- "from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward\n",
53
- "from transformers.models.gpt2.modeling_flax_gpt2 import FlaxGPT2BlockCollection\n",
54
- "from transformers.modeling_flax_outputs import FlaxBaseModelOutput"
55
- ],
56
- "execution_count": null,
57
- "outputs": []
58
- },
59
- {
60
- "cell_type": "code",
61
- "metadata": {
62
- "id": "dqkcoBOccszd"
63
- },
64
- "source": [
65
- "GPT2_START_DOCSTRING = r\"\"\"\n",
66
- " This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the\n",
67
- " generic methods the library implements for all its model (such as downloading or saving, resizing the input\n",
68
- " embeddings, pruning heads etc.)\n",
69
- " This model is also a Flax Linen `flax.nn.Module\n",
70
- " <https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax\n",
71
- " Module and refer to the Flax documentation for all matter related to general usage and behavior.\n",
72
- " Finally, this model supports inherent JAX features such as:\n",
73
- " - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__\n",
74
- " - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__\n",
75
- " - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__\n",
76
- " - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__\n",
77
- " Parameters:\n",
78
- " config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.\n",
79
- " Initializing with a config file does not load the weights associated with the model, only the\n",
80
- " configuration. Check out the :meth:`~transformers.FlaxPreTrainedModel.from_pretrained` method to load the\n",
81
- " model weights.\n",
82
- "\"\"\"\n",
83
- "\n",
84
- "GPT2_INPUTS_DOCSTRING = r\"\"\"\n",
85
- " Args:\n",
86
- " input_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size,input_ids_length)`):\n",
87
- " :obj:`input_ids_length` = ``sequence_length``. Indices of input sequence tokens in the vocabulary.\n",
88
- " Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See\n",
89
- " :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for\n",
90
- " details.\n",
91
- " `What are input IDs? <../glossary.html#input-ids>`__\n",
92
- " attention_mask (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):\n",
93
- " Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:\n",
94
- " - 1 for tokens that are **not masked**,\n",
95
- " - 0 for tokens that are **masked**.\n",
96
- " `What are attention masks? <../glossary.html#attention-mask>`__\n",
97
- " position_ids (:obj:`numpy.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):\n",
98
- " Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,\n",
99
- " config.max_position_embeddings - 1]``.\n",
100
- " past_key_values (:obj:`Dict[str, np.ndarray]`, `optional`, returned by ``init_cache`` or when passing previous ``past_key_values``):\n",
101
- " Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast\n",
102
- " auto-regressive decoding. Pre-computed key and value hidden-states are of shape `[batch_size, max_length]`.\n",
103
- " output_attentions (:obj:`bool`, `optional`):\n",
104
- " Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned\n",
105
- " tensors for more detail.\n",
106
- " output_hidden_states (:obj:`bool`, `optional`):\n",
107
- " Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for\n",
108
- " more detail.\n",
109
- " return_dict (:obj:`bool`, `optional`):\n",
110
- " Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.\n",
111
- "\"\"\""
112
- ],
113
- "execution_count": 3,
114
- "outputs": []
115
- },
116
- {
117
- "cell_type": "code",
118
- "metadata": {
119
- "id": "NX-Z5iCMbKL5"
120
- },
121
- "source": [
122
- "class FlaxGGGPreTrainedModel(FlaxPreTrainedModel):\n",
123
- " \"\"\"\n",
124
- " An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained\n",
125
- " models.\n",
126
- " \"\"\"\n",
127
- "\n",
128
- " config_class = GPT2Config\n",
129
- " base_model_prefix = \"transformer\"\n",
130
- " module_class: nn.Module = None\n",
131
- "\n",
132
- " def __init__(\n",
133
- " self,\n",
134
- " config: GPT2Config,\n",
135
- " input_shape: Tuple = (1,1),\n",
136
- " seed: int = 0,\n",
137
- " dtype: jnp.dtype = jnp.float32,\n",
138
- " **kwargs,\n",
139
- " ):\n",
140
- " \n",
141
- " module = self.module_class(config=config, dtype=dtype, **kwargs)\n",
142
- " super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)\n",
143
- "\n",
144
- " def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:\n",
145
- " # init input tensors\n",
146
- " input_ids = jnp.zeros(input_shape, dtype=\"i4\")\n",
147
- " attention_mask = jnp.ones_like(input_ids)\n",
148
- " \n",
149
- " position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)\n",
150
- "\n",
151
- " params_rng, dropout_rng = jax.random.split(rng)\n",
152
- " rngs = {\"params\": params_rng, \"dropout\": dropout_rng}\n",
153
- "\n",
154
- " return self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)[\"params\"]\n",
155
- "\n",
156
- " def init_cache(self, batch_size, max_length):\n",
157
- " r\"\"\"\n",
158
- " Args:\n",
159
- " batch_size (:obj:`int`):\n",
160
- " batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.\n",
161
- " max_length (:obj:`int`):\n",
162
- " maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized\n",
163
- " cache.\n",
164
- " \"\"\"\n",
165
- " # init input variables to retrieve cache\n",
166
- " input_ids = jnp.ones((batch_size, max_length))\n",
167
- " attention_mask = jnp.ones_like(input_ids)\n",
168
- " position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)\n",
169
- "\n",
170
- " init_variables = self.module.init(\n",
171
- " jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True\n",
172
- " )\n",
173
- " return init_variables[\"cache\"]\n",
174
- "\n",
175
- " @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n",
176
- " def __call__(\n",
177
- " self,\n",
178
- " input_ids,\n",
179
- " attention_mask=None,\n",
180
- " position_ids=None,\n",
181
- " params: dict = None,\n",
182
- " past_key_values: dict = None,\n",
183
- " dropout_rng: jax.random.PRNGKey = None,\n",
184
- " train: bool = False,\n",
185
- " output_attentions: Optional[bool] = None,\n",
186
- " output_hidden_states: Optional[bool] = None,\n",
187
- " return_dict: Optional[bool] = None,\n",
188
- " ):\n",
189
- " output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions\n",
190
- " output_hidden_states = (\n",
191
- " output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states\n",
192
- " )\n",
193
- " return_dict = return_dict if return_dict is not None else self.config.return_dict\n",
194
- " print(input_ids.shape)\n",
195
- "\n",
196
- " # batch_size, num_choices,sequence_length = input_ids.shape\n",
197
- "\n",
198
- " if position_ids is None:\n",
199
- " if past_key_values is not None:\n",
200
- " raise ValueError(\"Make sure to provide `position_ids` when passing `past_key_values`.\")\n",
201
- " \n",
202
- " position_ids=jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)\n",
203
- "\n",
204
- " # position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))\n",
205
- "\n",
206
- " if attention_mask is None:\n",
207
- " attention_mask = jnp.ones((input_ids))\n",
208
- " print('attn not')\n",
209
- "\n",
210
- " # Handle any PRNG if needed\n",
211
- " rngs = {}\n",
212
- " if dropout_rng is not None:\n",
213
- " rngs[\"dropout\"] = dropout_rng\n",
214
- "\n",
215
- " inputs = {\"params\": params or self.params}\n",
216
- "\n",
217
- " # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPT2Attention module\n",
218
- " if past_key_values:\n",
219
- " inputs[\"cache\"] = past_key_values\n",
220
- " mutable = [\"cache\"]\n",
221
- " else:\n",
222
- " mutable = False\n",
223
- "\n",
224
- " outputs = self.module.apply(\n",
225
- " inputs,\n",
226
- " jnp.array(input_ids, dtype=\"i4\"),\n",
227
- " jnp.array(attention_mask, dtype=\"i4\"),\n",
228
- " jnp.array(position_ids, dtype=\"i4\"),\n",
229
- " not train,\n",
230
- " False,\n",
231
- " output_attentions,\n",
232
- " output_hidden_states,\n",
233
- " return_dict,\n",
234
- " rngs=rngs,\n",
235
- " mutable=mutable,\n",
236
- " )\n",
237
- " print('cache')\n",
238
- "\n",
239
- " # add updated cache to model output\n",
240
- " if past_key_values is not None and return_dict:\n",
241
- " outputs, past_key_values = outputs\n",
242
- " outputs[\"past_key_values\"] = unfreeze(past_key_values[\"cache\"])\n",
243
- " return outputs\n",
244
- " elif past_key_values is not None and not return_dict:\n",
245
- " outputs, past_key_values = outputs\n",
246
- " outputs = outputs[:1] + (unfreeze(past_key_values[\"cache\"]),) + outputs[1:]\n",
247
- "\n",
248
- " return outputs"
249
- ],
250
- "execution_count": 6,
251
- "outputs": []
252
- },
253
- {
254
- "cell_type": "code",
255
- "metadata": {
256
- "id": "4vRAWll2bwQQ"
257
- },
258
- "source": [
259
- "class FlaxGGGModule(nn.Module):\n",
260
- " config: GPT2Config\n",
261
- " dtype: jnp.dtype = jnp.float32\n",
262
- "\n",
263
- " def setup(self):\n",
264
- " self.embed_dim = self.config.hidden_size\n",
265
- "\n",
266
- " self.wte = nn.Embed(\n",
267
- " self.config.vocab_size,\n",
268
- " self.embed_dim,\n",
269
- " embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),\n",
270
- " dtype=self.dtype,\n",
271
- " )\n",
272
- " self.wpe = nn.Embed(\n",
273
- " self.config.max_position_embeddings,\n",
274
- " self.embed_dim,\n",
275
- " embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),\n",
276
- " dtype=self.dtype,\n",
277
- " )\n",
278
- " self.dropout = nn.Dropout(rate=self.config.embd_pdrop)\n",
279
- " self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)\n",
280
- " self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)\n",
281
- "\n",
282
- " def __call__(\n",
283
- " self,\n",
284
- " input_ids,\n",
285
- " attention_mask,\n",
286
- " position_ids,\n",
287
- " deterministic=True,\n",
288
- " init_cache: bool = False,\n",
289
- " output_attentions: bool = False,\n",
290
- " output_hidden_states: bool = False,\n",
291
- " return_dict: bool = True,\n",
292
- " ):\n",
293
- " input_embeds = self.wte(input_ids.astype(\"i4\"))\n",
294
- " position_embeds = self.wpe(position_ids.astype(\"i4\"))\n",
295
- " \n",
296
- "\n",
297
- " hidden_states = input_embeds + position_embeds\n",
298
- " hidden_states = self.dropout(hidden_states, deterministic=deterministic)\n",
299
- " outputs = self.h(\n",
300
- " hidden_states,\n",
301
- " attention_mask,\n",
302
- " deterministic=deterministic,\n",
303
- " init_cache=init_cache,\n",
304
- " output_attentions=output_attentions,\n",
305
- " output_hidden_states=output_hidden_states,\n",
306
- " return_dict=return_dict,\n",
307
- " )\n",
308
- "\n",
309
- " hidden_states = outputs[0]\n",
310
- " hidden_states = self.ln_f(hidden_states)\n",
311
- " print('ggg')\n",
312
- " if not return_dict:\n",
313
- " return (hidden_states,) + outputs[1:]\n",
314
- "\n",
315
- " return FlaxBaseModelOutput(\n",
316
- " last_hidden_state=hidden_states,\n",
317
- " hidden_states=outputs.hidden_states,\n",
318
- " attentions=outputs.attentions,)\n",
319
- "class FlaxNewModel(FlaxGGGPreTrainedModel):\n",
320
- " module_class = FlaxGGGModule"
321
- ],
322
- "execution_count": 7,
323
- "outputs": []
324
- },
325
- {
326
- "cell_type": "code",
327
- "metadata": {
328
- "id": "_ljSn6GdedtI"
329
- },
330
- "source": [
331
- "class FlaxGPT2ForMultipleChoiceModule(nn.Module):\n",
332
- " config:GPT2Config\n",
333
- " dtype: jnp.dtype = jnp.float32\n",
334
- " def setup(self):\n",
335
- " self.gpt2 = FlaxNewModel(config=self.config, dtype=self.dtype)\n",
336
- " self.dropout = nn.Dropout(rate=0.2)\n",
337
- " self.classifier = nn.Dense(4, dtype=self.dtype)\n",
338
- "\n",
339
- " def __call__(self,input_ids,attention_mask,position_ids,return_dict=True,deterministic=True,*args):\n",
340
- " batch_size = input_ids.shape[0]\n",
341
- " rng=jax.random.PRNGKey(0)\n",
342
- " _, dropout_rng = jax.random.split(rng)\n",
343
- " print('abc')\n",
344
- "\n",
345
- " outputs=self.gpt2(input_ids, attention_mask,position_ids,return_dict=return_dict)\n",
346
- " \n",
347
- "\n",
348
- " hidden_states = outputs[0]\n",
349
- "\n",
350
- " \n",
351
- " hidden_states= jnp.mean(hidden_states, axis=1)\n",
352
- "\n",
353
- " print(hidden_states.shape)\n",
354
- " \n",
355
- " \n",
356
- " hidden_states=hidden_states.reshape(batch_size,-1) #(32,8,768)->(32,8*768)\n",
- "\n",
- " dropout_output = self.dropout(hidden_states,deterministic=deterministic,rng=dropout_rng)\n",
- "\n",
- " print(dropout_output.shape)\n",
- " \n",
- "\n",
- " logits = self.classifier(dropout_output)\n",
- " print('bnv')\n",
- " reshaped_logits = logits.reshape(-1, 4) \n",
- " #(32,4)\n",
- " if not return_dict:\n",
- " return (reshaped_logits,) + outputs[2:]\n",
- " return reshaped_logits"
- ],
- "execution_count": 8,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "M4UPf3Waexq0"
- },
- "source": [
- "class FlaxGPT2ForMultipleChoice(FlaxNewModel):\n",
- " module_class = FlaxGPT2ForMultipleChoiceModule"
- ],
- "execution_count": 9,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "roQ3vls4e4TH"
- },
- "source": [
- "model = FlaxGPT2ForMultipleChoice.from_pretrained('gpt2')"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "E9qOSaaie417"
- },
- "source": [
- "input_ids=jnp.ones((1,2,11))\n",
- "attention_mask=jnp.ones((1,2,11))"
- ],
- "execution_count": 12,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 409
- },
- "id": "am7hYv8auWVy",
- "outputId": "0c8192ca-a0ab-432e-d483-46f8a2cc2576"
- },
- "source": [
- "out1 = model(input_ids, attention_mask)"
- ],
- "execution_count": 13,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "(1, 2, 11)\n",
- "attn not\n",
- "ggg\n",
- "abc\n",
- "(1, 2, 11)\n",
- "attn not\n"
- ],
- "name": "stdout"
- },
- {
- "output_type": "error",
- "ename": "ValueError",
- "evalue": "ignored",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-13-6be36035677e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mout1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m<ipython-input-6-de553f26d169>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_ids, attention_mask, position_ids, params, past_key_values, dropout_rng, train, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0mrngs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrngs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 114\u001b[0;31m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 115\u001b[0m )\n\u001b[1;32m 116\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cache'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, variables, rngs, method, mutable, capture_intermediates, *args, **kwargs)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m )(variables, *args, **kwargs, rngs=rngs)\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m def init_with_output(self,\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/core/scope.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(variables, rngs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 685\u001b[0m **kwargs) -> Union[Any, Tuple[Any, VariableDict]]:\n\u001b[1;32m 686\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvariables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrngs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrngs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtemporary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmutable\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmutable_variables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mscope_fn\u001b[0;34m(scope, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1214\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1215\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1216\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscope\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1217\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1218\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mwrapped_module_method\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mfilter_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m<ipython-input-8-2c21e4c966c8>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_ids, attention_mask, position_ids, return_dict, deterministic, *args)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'abc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0moutputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgpt2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m<ipython-input-6-de553f26d169>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_ids, attention_mask, position_ids, params, past_key_values, dropout_rng, train, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0mrngs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrngs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 114\u001b[0;31m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 115\u001b[0m )\n\u001b[1;32m 116\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'cache'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, variables, rngs, method, mutable, capture_intermediates, *args, **kwargs)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m )(variables, *args, **kwargs, rngs=rngs)\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m def init_with_output(self,\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/core/scope.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(variables, rngs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 685\u001b[0m **kwargs) -> Union[Any, Tuple[Any, VariableDict]]:\n\u001b[1;32m 686\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvariables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrngs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrngs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmutable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmutable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtemporary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmutable\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmutable_variables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mscope_fn\u001b[0;34m(scope, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1214\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcapture_intermediates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1215\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1216\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscope\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1217\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1218\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mwrapped_module_method\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mfilter_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m<ipython-input-7-b2eaa3f7b251>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, input_ids, attention_mask, position_ids, deterministic, init_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 49\u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 50\u001b[0m )\n\u001b[1;32m 51\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mwrapped_module_method\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mfilter_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_flax_gpt2.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, hidden_states, attention_mask, deterministic, init_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 452\u001b[0m \u001b[0mdeterministic\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdeterministic\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0minit_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minit_cache\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 454\u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 455\u001b[0m )\n\u001b[1;32m 456\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlayer_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mwrapped_module_method\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mfilter_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_flax_gpt2.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, hidden_states, attention_mask, deterministic, init_cache, output_attentions)\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mdeterministic\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdeterministic\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0minit_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minit_cache\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 287\u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 288\u001b[0m )\n\u001b[1;32m 289\u001b[0m \u001b[0;31m# residual connection\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/flax/linen/module.py\u001b[0m in \u001b[0;36mwrapped_module_method\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mfilter_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcapture_stack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_flax_gpt2.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, hidden_states, attention_mask, deterministic, init_cache, output_attentions)\u001b[0m\n\u001b[1;32m 177\u001b[0m ):\n\u001b[1;32m 178\u001b[0m \u001b[0mqkv_out\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_attn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 179\u001b[0;31m \u001b[0mquery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqkv_out\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 180\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0mquery\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_split_heads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/jax/_src/numpy/lax_numpy.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(ary, indices_or_sections, axis)\u001b[0m\n\u001b[1;32m 1806\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_wraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1807\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindices_or_sections\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1808\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"split\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindices_or_sections\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1809\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_split_on_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp_fun\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/jax/_src/numpy/lax_numpy.py\u001b[0m in \u001b[0;36m_split\u001b[0;34m(op, ary, indices_or_sections, axis)\u001b[0m\n\u001b[1;32m 1798\u001b[0m + ((r + 1) * (part_size + 1) - 1)])\n\u001b[1;32m 1799\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1800\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"array split does not result in an equal division\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1801\u001b[0m \u001b[0mstarts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mends\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1802\u001b[0m \u001b[0m_subval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msubvals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mValueError\u001b[0m: array split does not result in an equal division"
- ]
- }
- ]
- }
- ]
- }