{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "114bfeb2ab21477397ff356a8d50d678": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_6c2a683959b74f06b71a21f62dc9d732", "IPY_MODEL_bfd850f32c4748cb9fc21e3f67a1e079", "IPY_MODEL_5405a44a0740459c9b2dc317f7e32af2" ], "layout": "IPY_MODEL_5ced55e2613f4ca09c576df4936d444d" } }, "6c2a683959b74f06b71a21f62dc9d732": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3d205afcaa37416dbafa166e7f6bb6ba", "placeholder": "​", "style": "IPY_MODEL_25969d182a7a4b569625efbb6e619bab", "value": "Downloading (…)okenizer_config.json: 100%" } }, "bfd850f32c4748cb9fc21e3f67a1e079": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_29c82e0054084c54bb58af9c3518017e", "max": 234, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_f969ec4a051d4a5ba29d285ed3f7f8f2", "value": 234 } }, "5405a44a0740459c9b2dc317f7e32af2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c3f0dd842604416bb3fdfb40d79d57c1", "placeholder": "​", "style": "IPY_MODEL_85d7256713914254b5ba39ef2eee37bb", "value": " 234/234 [00:00<00:00, 8.59kB/s]" } }, "5ced55e2613f4ca09c576df4936d444d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, 
"grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3d205afcaa37416dbafa166e7f6bb6ba": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "25969d182a7a4b569625efbb6e619bab": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "29c82e0054084c54bb58af9c3518017e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f969ec4a051d4a5ba29d285ed3f7f8f2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c3f0dd842604416bb3fdfb40d79d57c1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "85d7256713914254b5ba39ef2eee37bb": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9826e15254ca413eb37dc7f8837f6827": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8d9500746f61474e81bcf51beaba54e1", "IPY_MODEL_41b89216ec9544af91b8b7f7e2f4664e", "IPY_MODEL_1538103aa26d41459b67843f917c5a1a" ], "layout": "IPY_MODEL_48bd08d79550483f84b241127e31878c" } }, "8d9500746f61474e81bcf51beaba54e1": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b4f2ddbbf9de4898a7830c79ca2e20d7", "placeholder": "​", "style": "IPY_MODEL_c9ddb75f29af47b38f79b09792116673", "value": "Downloading (…)olve/main/vocab.json: 100%" } }, "41b89216ec9544af91b8b7f7e2f4664e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6f3eba08a72348df89642e7eee1fec30", "max": 1042301, "min": 0, 
"orientation": "horizontal", "style": "IPY_MODEL_4ae0e50ecd7f424dac6c9c2c5aa62696", "value": 1042301 } }, "1538103aa26d41459b67843f917c5a1a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a71e6379e60c46b5915f0ecd097a2a33", "placeholder": "​", "style": "IPY_MODEL_0a780d58219d46978fad719257430d5f", "value": " 1.04M/1.04M [00:00<00:00, 3.15MB/s]" } }, "48bd08d79550483f84b241127e31878c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b4f2ddbbf9de4898a7830c79ca2e20d7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c9ddb75f29af47b38f79b09792116673": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6f3eba08a72348df89642e7eee1fec30": { "model_module": "@jupyter-widgets/base", "model_name": 
"LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4ae0e50ecd7f424dac6c9c2c5aa62696": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "a71e6379e60c46b5915f0ecd097a2a33": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0a780d58219d46978fad719257430d5f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e685a0136ffa4280b0a49263c4f842d9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ 
"IPY_MODEL_845a84b8f7344a468e0b0b72f0708378", "IPY_MODEL_a56823a90ee64b8fb87b2d2ba8857e53", "IPY_MODEL_b6698a0e0ca2409daf771b0d0a419e3a" ], "layout": "IPY_MODEL_c8ac228076fa4fb88ff37e2f3c34be93" } }, "845a84b8f7344a468e0b0b72f0708378": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f16775e4439a4c4f886c4c4027d30832", "placeholder": "​", "style": "IPY_MODEL_3b40ee2e5b60499e81f8c79a6383b2ec", "value": "Downloading (…)olve/main/merges.txt: 100%" } }, "a56823a90ee64b8fb87b2d2ba8857e53": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9a1f2210fd1e47d3ba66cf0a99f937b8", "max": 456318, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_fc2aa788745848c8b363c44e7d07116b", "value": 456318 } }, "b6698a0e0ca2409daf771b0d0a419e3a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d9ea0947db244309b8effbfe53267042", "placeholder": "​", "style": "IPY_MODEL_f52cccc762dc4c419671b2aade6a3506", "value": " 456k/456k [00:00<00:00, 2.84MB/s]" } }, "c8ac228076fa4fb88ff37e2f3c34be93": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f16775e4439a4c4f886c4c4027d30832": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": 
"LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3b40ee2e5b60499e81f8c79a6383b2ec": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9a1f2210fd1e47d3ba66cf0a99f937b8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fc2aa788745848c8b363c44e7d07116b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "d9ea0947db244309b8effbfe53267042": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, 
"grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f52cccc762dc4c419671b2aade6a3506": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9c7c0a7882964d8ca8031ac18af93253": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_f5f57b4e23174d0fa8142dcb57207720", "IPY_MODEL_50f474df9ab147f7ba094c1cdb2ba69e", "IPY_MODEL_35c204e247044a11a9ee871cfbcbe430" ], "layout": "IPY_MODEL_8596f49e4d594ae4a0e848608c7d122f" } }, "f5f57b4e23174d0fa8142dcb57207720": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_874be31d42ce487697133b07065f815d", "placeholder": "​", "style": "IPY_MODEL_41d87a1c6a6a4d32ac20f35f779207d5", "value": "Downloading (…)cial_tokens_map.json: 100%" } }, "50f474df9ab147f7ba094c1cdb2ba69e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b7ab4be64f8b487893997a5d57eb9f09", "max": 99, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_8620138609a24a3aae22da774fb32bdb", "value": 99 } }, "35c204e247044a11a9ee871cfbcbe430": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a76b7ee703ed4ed08a1e0e1a708aa117", "placeholder": "​", "style": "IPY_MODEL_44aa877b3a844922a1ef50809713fe32", "value": " 99.0/99.0 [00:00<00:00, 2.91kB/s]" } }, 
"8596f49e4d594ae4a0e848608c7d122f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "874be31d42ce487697133b07065f815d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "41d87a1c6a6a4d32ac20f35f779207d5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b7ab4be64f8b487893997a5d57eb9f09": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": 
null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8620138609a24a3aae22da774fb32bdb": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "a76b7ee703ed4ed08a1e0e1a708aa117": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "44aa877b3a844922a1ef50809713fe32": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "21f3adc7a34149d88b0e67b537eec488": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_61c7dc85009b4ab0968b8222a49297a1", "IPY_MODEL_194d14e6d61f47a28f7615f0c0071831", "IPY_MODEL_1e5eea17dab34a18a3e8fafe6988ad90" ], "layout": "IPY_MODEL_630b9cb70bed4900b97cfa3c476d6ca0" } }, "61c7dc85009b4ab0968b8222a49297a1": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7d0e80dbc0dc4431ba3eacc2ed719e2e", "placeholder": "​", "style": "IPY_MODEL_46561fc96ed645e1bfc4039a222b2f66", 
"value": "Downloading (…)lve/main/config.json: 100%" } }, "194d14e6d61f47a28f7615f0c0071831": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_162e24639a1d462eaa67e8e8040ca879", "max": 1242, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_9ab05a9fc9134fe19ef600b8cde42b78", "value": 1242 } }, "1e5eea17dab34a18a3e8fafe6988ad90": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b80761f44a4e4c889e3acfa06852c064", "placeholder": "​", "style": "IPY_MODEL_713e206a8c894bcf965cb96a97f30be7", "value": " 1.24k/1.24k [00:00<00:00, 35.4kB/s]" } }, "630b9cb70bed4900b97cfa3c476d6ca0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7d0e80dbc0dc4431ba3eacc2ed719e2e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": 
null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "46561fc96ed645e1bfc4039a222b2f66": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "162e24639a1d462eaa67e8e8040ca879": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9ab05a9fc9134fe19ef600b8cde42b78": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b80761f44a4e4c889e3acfa06852c064": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "713e206a8c894bcf965cb96a97f30be7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "85a1329fbf9e44c8b708a1ad868691d7": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_9be2d523ce7e4c2daeed4f4846f16cff", "IPY_MODEL_3321f3d71e684ca4afa48a1e0b889ebd", "IPY_MODEL_adb1ff19806c4d9abfef424270408759" ], "layout": "IPY_MODEL_d7a5976a0b6b4a62913bdf148d06a6bf" } }, "9be2d523ce7e4c2daeed4f4846f16cff": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_efed82b2f6f84516989d07f9515a102d", "placeholder": "​", "style": "IPY_MODEL_4f6b53cc236b448093d23240bd28a6d8", "value": "Downloading (…)onfiguration_btlm.py: 100%" } }, "3321f3d71e684ca4afa48a1e0b889ebd": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b7821089bfe94594982736d3005b82e9", "max": 7584, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_d6f016012139482b9ffc48407a0e7cb1", "value": 7584 } }, "adb1ff19806c4d9abfef424270408759": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_eee7b8b660f74f0e9e0634a17b29ecb1", "placeholder": "​", "style": "IPY_MODEL_6296781e8cc94316a44c5a0a0a77f46c", "value": " 7.58k/7.58k [00:00<00:00, 221kB/s]" } }, "d7a5976a0b6b4a62913bdf148d06a6bf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": 
null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "efed82b2f6f84516989d07f9515a102d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4f6b53cc236b448093d23240bd28a6d8": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b7821089bfe94594982736d3005b82e9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d6f016012139482b9ffc48407a0e7cb1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, 
"eee7b8b660f74f0e9e0634a17b29ecb1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6296781e8cc94316a44c5a0a0a77f46c": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0c4007f367744ccf80f3c9af09af53de": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_87a60648650a41fa9b3573be98e4adad", "IPY_MODEL_0cbd3f72373544b8a946c70627e515f9", "IPY_MODEL_d0f8fa0a331b4da98c0b55311ea073a0" ], "layout": "IPY_MODEL_a312c5e9461d470ca8c3cd2184a7ee09" } }, "87a60648650a41fa9b3573be98e4adad": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8980f61b2d4b4164a21b596ce515f6e3", "placeholder": "​", "style": "IPY_MODEL_d58f9dcf2c7b4fa887702b231ba0ddd1", "value": "Downloading (…)ain/modeling_btlm.py: 100%" } }, "0cbd3f72373544b8a946c70627e515f9": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_36cafb73c9fd4bb486853771b146fb44", "max": 71508, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a31d4ba8182a41f7a164a87e0492bfe2", "value": 71508 } }, 
"d0f8fa0a331b4da98c0b55311ea073a0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_48584e07c5074502acc46680e17b6dd8", "placeholder": "​", "style": "IPY_MODEL_7731d185d2d0401c9ce8dd045ffc187a", "value": " 71.5k/71.5k [00:00<00:00, 935kB/s]" } }, "a312c5e9461d470ca8c3cd2184a7ee09": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8980f61b2d4b4164a21b596ce515f6e3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d58f9dcf2c7b4fa887702b231ba0ddd1": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "36cafb73c9fd4bb486853771b146fb44": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", 
"_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a31d4ba8182a41f7a164a87e0492bfe2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "48584e07c5074502acc46680e17b6dd8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7731d185d2d0401c9ce8dd045ffc187a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b5f78d5cabff40ca8749376141ea8d23": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8e4700723e894b7380862d08a34afe9d", "IPY_MODEL_3b1d6cbf8b2b4332a3fdbe2a5a755116", "IPY_MODEL_5f52ddd679124bbfb141e10103b5ac42" ], 
"layout": "IPY_MODEL_289ac602411348498cd757b2a6b53a49" } }, "8e4700723e894b7380862d08a34afe9d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fdfb46b541434685b1b0cf11c2fb8597", "placeholder": "​", "style": "IPY_MODEL_c45db1178d344416b5a107048f482a3e", "value": "Downloading pytorch_model.bin: 100%" } }, "3b1d6cbf8b2b4332a3fdbe2a5a755116": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b8ffc780321f405592a9112487316d31", "max": 5292652837, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_b24b1532289f4ed99e3b645248aff802", "value": 5292652837 } }, "5f52ddd679124bbfb141e10103b5ac42": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6d924d2d6820449daf637270470fa50b", "placeholder": "​", "style": "IPY_MODEL_ca9129ae04054d419c5a2c5b8f5b8364", "value": " 5.29G/5.29G [00:33<00:00, 215MB/s]" } }, "289ac602411348498cd757b2a6b53a49": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fdfb46b541434685b1b0cf11c2fb8597": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", 
"align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c45db1178d344416b5a107048f482a3e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b8ffc780321f405592a9112487316d31": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b24b1532289f4ed99e3b645248aff802": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "6d924d2d6820449daf637270470fa50b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, 
"height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ca9129ae04054d419c5a2c5b8f5b8364": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "8bf00751b22e4aae8bb5e715940dea8f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b996c1623b584cdeaa79a68fcb3f6535", "IPY_MODEL_53b6219000d541a2ab09b807084e33da", "IPY_MODEL_2714c5ad04fb4621baf8affc9795027a" ], "layout": "IPY_MODEL_d6200cca0bab43e9b24a86be582833d9" } }, "b996c1623b584cdeaa79a68fcb3f6535": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a8085708e61f4f08a9f90b881d957e88", "placeholder": "​", "style": "IPY_MODEL_cc839ad647d94e2fac9bffb8e3e1c421", "value": "Downloading (…)neration_config.json: 100%" } }, "53b6219000d541a2ab09b807084e33da": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0eab7e2aafd14b96bb601b11fbf6af59", "max": 119, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_bff03d7ae4f04a55b0ba015fe9b70c54", "value": 119 } }, "2714c5ad04fb4621baf8affc9795027a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c95ee59f813e4ccaad4c845fe2c67512", "placeholder": "​", "style": "IPY_MODEL_823526b63b3d4598a456955884ba7293", "value": " 119/119 [00:00<00:00, 5.35kB/s]" } }, "d6200cca0bab43e9b24a86be582833d9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", 
"state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a8085708e61f4f08a9f90b881d957e88": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cc839ad647d94e2fac9bffb8e3e1c421": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0eab7e2aafd14b96bb601b11fbf6af59": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": 
null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bff03d7ae4f04a55b0ba015fe9b70c54": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c95ee59f813e4ccaad4c845fe2c67512": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "823526b63b3d4598a456955884ba7293": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "427689dd96c94e9e8706872de004600c": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_a4174533851147d0b5452428cfbcb4d0", "IPY_MODEL_e4cc83f17f3941a69be2c5a328c1df7c", "IPY_MODEL_9f696bef605146c088cd39c3e2d41fa7", "IPY_MODEL_4b995377bec241c6b61b7e68518403bb" ], "layout": "IPY_MODEL_59aec4ac781a42709f49465aaa8beda8" } }, "f3d6c5dcb2674419a9cc9d0a78a417ea": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_96c57c6e5c5a446d96664912b2a1f923", "placeholder": "​", "style": "IPY_MODEL_acf63c0070b346f8ba0833bf9d6dbf76", "value": "
Copy a token from your Hugging Face\ntokens page and paste it below.\nImmediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "1f0e6e0047dc471a91a581505031dcf4": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_72da3e460812424db18da699f1adc68f", "placeholder": "​", "style": "IPY_MODEL_6b9996976aeb46a781c2827c7eb80400", "value": "" } }, "63e88741fa824103a38134ca7f92efc4": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_0f5aa3f5a3fd4869ad22a7d7eb3f4a01", "style": "IPY_MODEL_1a4f383e5b6e4edea22cf8269ac3e136", "value": true } }, "760f19938d4a420fba6b604413bd75c5": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_5e880b14816c40e1ade44bb4e8e3e7a0", "style": "IPY_MODEL_c214e6421b8c444eb91f01729650dfc3", "tooltip": "" } }, "1dc3baf2eb7a4d3981a7a38d574545ea": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5ce845d6ca8e40c5ad7c9d07f7ad271c", "placeholder": "​", "style": "IPY_MODEL_4b7fc4873cc44b9eaf915d4e8bd5e134", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" } }, "59aec4ac781a42709f49465aaa8beda8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "96c57c6e5c5a446d96664912b2a1f923": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "acf63c0070b346f8ba0833bf9d6dbf76": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "72da3e460812424db18da699f1adc68f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, 
"justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6b9996976aeb46a781c2827c7eb80400": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0f5aa3f5a3fd4869ad22a7d7eb3f4a01": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1a4f383e5b6e4edea22cf8269ac3e136": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5e880b14816c40e1ade44bb4e8e3e7a0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c214e6421b8c444eb91f01729650dfc3": { "model_module": 
"@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "5ce845d6ca8e40c5ad7c9d07f7ad271c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4b7fc4873cc44b9eaf915d4e8bd5e134": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c8fcc29ea48f44b49ec3547067348bc2": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_665705e3373b4a2093bd921a2341bf99", "IPY_MODEL_e890ec5fafb344609f506f16be57d799", "IPY_MODEL_92302f6d76014a778c6cb7370d2f114c" ], "layout": "IPY_MODEL_a1652fbe06094b51994fc22f16f9615a" } }, "665705e3373b4a2093bd921a2341bf99": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a122a0fc52604974a77fe18f7dd43c2f", "placeholder": "​", "style": "IPY_MODEL_dfa55c882b3d44c395d89c53af5f68d7", "value": "btlm-3b.ggml.bin: 100%" } }, "e890ec5fafb344609f506f16be57d799": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fac01390a0634c0caa885e4626fda033", "max": 5557636513, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_6c8e3fe6bca1438a879a0ee18abc5294", "value": 5557636513 } }, "92302f6d76014a778c6cb7370d2f114c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d9ffd07e99cb4a2da0966f653351e6cf", "placeholder": "​", "style": "IPY_MODEL_0005a3238bb84c619e657100845f9c84", "value": " 5.56G/5.56G [03:45<00:00, 20.9MB/s]" } }, "a1652fbe06094b51994fc22f16f9615a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a122a0fc52604974a77fe18f7dd43c2f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dfa55c882b3d44c395d89c53af5f68d7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": 
"DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fac01390a0634c0caa885e4626fda033": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6c8e3fe6bca1438a879a0ee18abc5294": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "d9ffd07e99cb4a2da0966f653351e6cf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0005a3238bb84c619e657100845f9c84": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b7ace70b606943059acf751b89b53291": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b6e56ddf9da04a52b99de60c9e5a3f57", "placeholder": "​", "style": "IPY_MODEL_55b11341f930432282c18b908e8c7c4a", "value": "Connecting..." } }, "b6e56ddf9da04a52b99de60c9e5a3f57": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "55b11341f930432282c18b908e8c7c4a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a4174533851147d0b5452428cfbcb4d0": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b7e2a45a24724bc79db6b055ff3c6e90", "placeholder": "​", "style": "IPY_MODEL_1b2fb7402b6d4bdcbe38e3e1e9179b74", "value": "Token is valid (permission: write)." } }, "e4cc83f17f3941a69be2c5a328c1df7c": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_51921dbb33f445f7ab72666ecac3f85c", "placeholder": "​", "style": "IPY_MODEL_e804d48983884eb7bfd85123482317f7", "value": "Your token has been saved in your configured git credential helpers (store)." 
} }, "9f696bef605146c088cd39c3e2d41fa7": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a15221a1538c43c7ac162324644f2c2d", "placeholder": "​", "style": "IPY_MODEL_baac4e68e10c4e7bbffeff26b066f308", "value": "Your token has been saved to /root/.cache/huggingface/token" } }, "4b995377bec241c6b61b7e68518403bb": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_04ed18f2ff804865af93d4d73b347d77", "placeholder": "​", "style": "IPY_MODEL_603ecce9c961468eb90510fe0587bc32", "value": "Login successful" } }, "b7e2a45a24724bc79db6b055ff3c6e90": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1b2fb7402b6d4bdcbe38e3e1e9179b74": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "51921dbb33f445f7ab72666ecac3f85c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e804d48983884eb7bfd85123482317f7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a15221a1538c43c7ac162324644f2c2d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "baac4e68e10c4e7bbffeff26b066f308": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "04ed18f2ff804865af93d4d73b347d77": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, 
"right": null, "top": null, "visibility": null, "width": null } }, "603ecce9c961468eb90510fe0587bc32": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5tqWiYIiWl96", "outputId": "07b855ab-a412-4af7-d36c-de2cc77810c3" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting transformers\n", " Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m26.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m83.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m70.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.7.1)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers\n", "Successfully installed huggingface-hub-0.16.4 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.31.0\n", "Collecting accelerate\n", " Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.2/244.2 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (23.1)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.1)\n", "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu118)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.12.2)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (4.7.1)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.11.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.2)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (3.25.2)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (16.0.6)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.3)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n", "Installing collected packages: accelerate\n", "Successfully installed accelerate-0.21.0\n", "Collecting bitsandbytes\n", " Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: bitsandbytes\n", "Successfully installed bitsandbytes-0.41.1\n" ] } ], "source": [ "!pip install -U transformers\n", "!pip install -U accelerate #git+https://github.com/huggingface/accelerate.git\n", "!pip install -U bitsandbytes #git+ https://github.com/timdettmers/bitsandbytes.git" ] }, { "cell_type": "code", "source": [ "from transformers import AutoTokenizer, AutoModelForCausalLM\n", "\n", "# Load the tokenizer and model\n", "tokenizer = AutoTokenizer.from_pretrained(\"cerebras/btlm-3b-8k-base\")\n", "model = 
AutoModelForCausalLM.from_pretrained(\n", " \"cerebras/btlm-3b-8k-base\",\n", " trust_remote_code=True,\n", " torch_dtype=\"auto\",\n", " load_in_8bit=True,\n", " offload_folder=\"offload\",\n", " )\n", "\n", "# Set the prompt for generating text\n", "prompt = \"Albert Einstein was known for \"\n", "\n", "# Tokenize the prompt and convert to PyTorch tensors\n", "inputs = tokenizer(prompt, return_tensors=\"pt\")\n", "\n", "# Generate text using the model\n", "outputs = model.generate(\n", " **inputs,\n", " num_beams=5,\n", " max_new_tokens=50,\n", " early_stopping=True,\n", " no_repeat_ngram_size=2\n", ")\n", "\n", "# Convert the generated token IDs back to text\n", "generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", "\n", "# Print the generated text\n", "print(generated_text[0])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 603, "referenced_widgets": [ "114bfeb2ab21477397ff356a8d50d678", "6c2a683959b74f06b71a21f62dc9d732", "bfd850f32c4748cb9fc21e3f67a1e079", "5405a44a0740459c9b2dc317f7e32af2", "5ced55e2613f4ca09c576df4936d444d", "3d205afcaa37416dbafa166e7f6bb6ba", "25969d182a7a4b569625efbb6e619bab", "29c82e0054084c54bb58af9c3518017e", "f969ec4a051d4a5ba29d285ed3f7f8f2", "c3f0dd842604416bb3fdfb40d79d57c1", "85d7256713914254b5ba39ef2eee37bb", "9826e15254ca413eb37dc7f8837f6827", "8d9500746f61474e81bcf51beaba54e1", "41b89216ec9544af91b8b7f7e2f4664e", "1538103aa26d41459b67843f917c5a1a", "48bd08d79550483f84b241127e31878c", "b4f2ddbbf9de4898a7830c79ca2e20d7", "c9ddb75f29af47b38f79b09792116673", "6f3eba08a72348df89642e7eee1fec30", "4ae0e50ecd7f424dac6c9c2c5aa62696", "a71e6379e60c46b5915f0ecd097a2a33", "0a780d58219d46978fad719257430d5f", "e685a0136ffa4280b0a49263c4f842d9", "845a84b8f7344a468e0b0b72f0708378", "a56823a90ee64b8fb87b2d2ba8857e53", "b6698a0e0ca2409daf771b0d0a419e3a", "c8ac228076fa4fb88ff37e2f3c34be93", "f16775e4439a4c4f886c4c4027d30832", "3b40ee2e5b60499e81f8c79a6383b2ec", "9a1f2210fd1e47d3ba66cf0a99f937b8", "fc2aa788745848c8b363c44e7d07116b", "d9ea0947db244309b8effbfe53267042", "f52cccc762dc4c419671b2aade6a3506", "9c7c0a7882964d8ca8031ac18af93253", "f5f57b4e23174d0fa8142dcb57207720", "50f474df9ab147f7ba094c1cdb2ba69e", "35c204e247044a11a9ee871cfbcbe430", "8596f49e4d594ae4a0e848608c7d122f", "874be31d42ce487697133b07065f815d", "41d87a1c6a6a4d32ac20f35f779207d5", "b7ab4be64f8b487893997a5d57eb9f09", "8620138609a24a3aae22da774fb32bdb", "a76b7ee703ed4ed08a1e0e1a708aa117", "44aa877b3a844922a1ef50809713fe32", "21f3adc7a34149d88b0e67b537eec488", "61c7dc85009b4ab0968b8222a49297a1", "194d14e6d61f47a28f7615f0c0071831", "1e5eea17dab34a18a3e8fafe6988ad90", "630b9cb70bed4900b97cfa3c476d6ca0", "7d0e80dbc0dc4431ba3eacc2ed719e2e", "46561fc96ed645e1bfc4039a222b2f66", "162e24639a1d462eaa67e8e8040ca879", "9ab05a9fc9134fe19ef600b8cde42b78", "b80761f44a4e4c889e3acfa06852c064", "713e206a8c894bcf965cb96a97f30be7", "85a1329fbf9e44c8b708a1ad868691d7", "9be2d523ce7e4c2daeed4f4846f16cff", "3321f3d71e684ca4afa48a1e0b889ebd", "adb1ff19806c4d9abfef424270408759", "d7a5976a0b6b4a62913bdf148d06a6bf", "efed82b2f6f84516989d07f9515a102d", "4f6b53cc236b448093d23240bd28a6d8", "b7821089bfe94594982736d3005b82e9", "d6f016012139482b9ffc48407a0e7cb1", "eee7b8b660f74f0e9e0634a17b29ecb1", "6296781e8cc94316a44c5a0a0a77f46c", "0c4007f367744ccf80f3c9af09af53de", "87a60648650a41fa9b3573be98e4adad", "0cbd3f72373544b8a946c70627e515f9", "d0f8fa0a331b4da98c0b55311ea073a0", "a312c5e9461d470ca8c3cd2184a7ee09", "8980f61b2d4b4164a21b596ce515f6e3", "d58f9dcf2c7b4fa887702b231ba0ddd1", 
"36cafb73c9fd4bb486853771b146fb44", "a31d4ba8182a41f7a164a87e0492bfe2", "48584e07c5074502acc46680e17b6dd8", "7731d185d2d0401c9ce8dd045ffc187a", "b5f78d5cabff40ca8749376141ea8d23", "8e4700723e894b7380862d08a34afe9d", "3b1d6cbf8b2b4332a3fdbe2a5a755116", "5f52ddd679124bbfb141e10103b5ac42", "289ac602411348498cd757b2a6b53a49", "fdfb46b541434685b1b0cf11c2fb8597", "c45db1178d344416b5a107048f482a3e", "b8ffc780321f405592a9112487316d31", "b24b1532289f4ed99e3b645248aff802", "6d924d2d6820449daf637270470fa50b", "ca9129ae04054d419c5a2c5b8f5b8364", "8bf00751b22e4aae8bb5e715940dea8f", "b996c1623b584cdeaa79a68fcb3f6535", "53b6219000d541a2ab09b807084e33da", "2714c5ad04fb4621baf8affc9795027a", "d6200cca0bab43e9b24a86be582833d9", "a8085708e61f4f08a9f90b881d957e88", "cc839ad647d94e2fac9bffb8e3e1c421", "0eab7e2aafd14b96bb601b11fbf6af59", "bff03d7ae4f04a55b0ba015fe9b70c54", "c95ee59f813e4ccaad4c845fe2c67512", "823526b63b3d4598a456955884ba7293" ] }, "id": "PUU3t5QeXhTM", "outputId": "0829a1e4-9ea6-4dbd-89f2-59504837dc95" }, "execution_count": 2, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/234 [00:00\", list_vars[name])\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IP0eQEnnYPiX", "outputId": "54e56c69-0741-4669-d97c-6b5a5a518940" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "transformer.wte.weight => tensor([[ 0.0200, 0.0442, 0.0562, ..., 0.0173, -0.0238, -0.0889],\n", " [-0.0259, 0.0170, -0.0221, ..., -0.0752, -0.0635, 0.0947],\n", " [-0.0276, 0.1846, 0.1533, ..., -0.0195, 0.0299, 0.0796],\n", " ...,\n", " [ 0.1182, 0.1523, 0.0742, ..., -0.1162, 0.0177, 0.0991],\n", " [ 0.0220, -0.0579, 0.0125, ..., -0.0576, 0.0327, 0.0211],\n", " [ 0.0508, -0.0217, 0.0278, ..., -0.0308, -0.0378, 0.0013]],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.ln_1.weight => tensor([0.6445, 0.7344, 0.6133, ..., 0.6484, 0.7070, 0.7148], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.0.ln_1.bias => tensor([ 0.0288, 0.0082, -0.0771, ..., 0.0284, -0.0391, 0.0233],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.attn.c_attn.weight => tensor([[ 15, 3, -15, ..., 67, 40, 29],\n", " [ -4, 43, 26, ..., 9, 15, 19],\n", " [ 41, 33, 6, ..., -20, -27, -35],\n", " ...,\n", " [ 27, -51, 6, ..., -1, 76, -31],\n", " [-17, 3, 20, ..., -55, 15, -58],\n", " [ 40, 29, 2, ..., 39, 39, -19]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.0.attn.c_attn.bias => tensor([-0.0408, 0.0349, 0.0292, ..., -0.0286, -0.7773, -0.0108],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.attn.c_attn.SCB => tensor([0.0693, 0.0830, 0.0669, ..., 0.0708, 0.1543, 0.0742], device='cuda:0')\n", "transformer.h.0.attn.c_attn.weight_format => col_turing\n", "transformer.h.0.attn.c_proj.weight => tensor([[ 12, -39, -38, ..., 68, -45, -30],\n", " [ 52, -20, -48, ..., 59, -32, 70],\n", " [ 35, 71, -81, ..., -29, 13, -66],\n", " ...,\n", " [-11, 3, 44, ..., -38, -83, 30],\n", " [ -6, -8, -15, ..., 30, 3, -32],\n", " [ 5, 52, 31, ..., 43, -30, -8]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.0.attn.c_proj.bias => tensor([ 0.5938, -0.7500, -0.0056, ..., -0.4355, 0.4648, 0.4414],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.attn.c_proj.SCB => tensor([0.1226, 0.1001, 0.0830, ..., 0.1030, 0.0938, 0.0688], device='cuda:0')\n", "transformer.h.0.attn.c_proj.weight_format => col_turing\n", "transformer.h.0.ln_2.weight => 
tensor([0.5117, 0.5352, 0.5156, ..., 0.5078, 0.5195, 0.5391], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.0.ln_2.bias => tensor([-0.1426, 0.1338, 0.0157, ..., 0.0571, -0.0947, -0.1455],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.mlp.c_fc.weight => tensor([[ 17, 71, -105, ..., 33, 15, 9],\n", " [ -15, 13, 6, ..., -23, 66, -86],\n", " [ -21, -37, -62, ..., -17, 24, -50],\n", " ...,\n", " [ 42, 15, 1, ..., -20, 18, 24],\n", " [ 10, -15, 14, ..., -20, -1, -1],\n", " [ 2, -1, -2, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.0.mlp.c_fc.bias => tensor([ 0.1357, -0.0933, 0.0977, ..., -0.3086, -0.0693, 0.0530],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.mlp.c_fc.SCB => tensor([0.1245, 0.1108, 0.1602, ..., 0.1060, 0.1113, 0.1113], device='cuda:0')\n", "transformer.h.0.mlp.c_fc.weight_format => col_turing\n", "transformer.h.0.mlp.c_fc2.weight => tensor([[ 17, -43, -8, ..., -8, 67, 64],\n", " [ 25, -40, -8, ..., 8, -30, 61],\n", " [-59, 44, -16, ..., -37, -50, 25],\n", " ...,\n", " [-24, -42, -3, ..., -40, 20, 33],\n", " [ 21, 25, -10, ..., 37, 54, -11],\n", " [ 32, 16, 17, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.0.mlp.c_fc2.bias => tensor([-0.6328, -0.6250, 0.1611, ..., -0.2500, -0.8984, -0.8008],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.mlp.c_fc2.SCB => tensor([0.1172, 0.1069, 0.1060, ..., 0.1367, 0.1299, 0.1289], device='cuda:0')\n", "transformer.h.0.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.0.mlp.c_proj.weight => tensor([[ 32, 43, 34, ..., -48, -28, 103],\n", " [-20, -35, -21, ..., 66, 21, 23],\n", " [-32, 19, -11, ..., -2, -24, -1],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ -6, -1, -15, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.0.mlp.c_proj.bias => tensor([-0.7422, 0.3691, -0.2695, ..., 0.2910, 0.3066, -0.3066],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.0.mlp.c_proj.SCB => tensor([0.1328, 0.1484, 0.1279, ..., 0.1270, 0.1299, 0.1216], device='cuda:0')\n", "transformer.h.0.mlp.c_proj.weight_format => col_turing\n", "transformer.h.1.ln_1.weight => tensor([0.8438, 0.9102, 0.8672, ..., 0.8750, 0.8672, 0.9648], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.1.ln_1.bias => tensor([ 0.0544, 0.0525, 0.0654, ..., -0.0067, -0.1738, -0.0200],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.attn.c_attn.weight => tensor([[ 50, -3, -9, ..., 24, -10, 2],\n", " [ -27, -29, -4, ..., -17, -17, 31],\n", " [ -31, 19, 40, ..., -20, -12, -37],\n", " ...,\n", " [ 98, -108, -8, ..., 2, 9, 3],\n", " [ -11, -60, -9, ..., -19, -48, 23],\n", " [ 24, -21, 14, ..., -7, -14, 55]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.1.attn.c_attn.bias => tensor([-1.4844, 0.0466, -0.4785, ..., -0.0747, -0.0608, -0.4160],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.attn.c_attn.SCB => tensor([0.1465, 0.1328, 0.1279, ..., 0.1050, 0.0933, 0.0674], device='cuda:0')\n", "transformer.h.1.attn.c_attn.weight_format => col_turing\n", "transformer.h.1.attn.c_proj.weight => tensor([[ 39, -61, -16, ..., 37, -11, 3],\n", " [ 51, -58, 14, ..., 2, 45, 25],\n", " [ 51, 42, -3, ..., -51, 17, -27],\n", " ...,\n", " [ 14, 14, -23, ..., 14, 16, 9],\n", " [ 12, 34, 16, ..., 4, 24, -13],\n", " [-17, 18, 13, ..., 62, -13, -39]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.1.attn.c_proj.bias => tensor([ 
0.1816, 0.0825, -0.3301, ..., 0.1484, -0.1641, 0.5312],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.attn.c_proj.SCB => tensor([0.1260, 0.0879, 0.0796, ..., 0.0869, 0.0908, 0.0791], device='cuda:0')\n", "transformer.h.1.attn.c_proj.weight_format => col_turing\n", "transformer.h.1.ln_2.weight => tensor([0.5352, 0.6094, 0.5977, ..., 0.5625, 0.6016, 0.6211], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.1.ln_2.bias => tensor([ 0.0344, 0.0225, 0.0618, ..., -0.0493, 0.0116, -0.1426],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.mlp.c_fc.weight => tensor([[ 11, 56, 8, ..., 20, 38, -61],\n", " [-41, -37, 26, ..., 46, -10, 4],\n", " [ 29, 20, -29, ..., -8, 15, 75],\n", " ...,\n", " [ 53, 66, -38, ..., 16, 16, 8],\n", " [-44, 5, 6, ..., -12, -37, 10],\n", " [-29, -33, -32, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.1.mlp.c_fc.bias => tensor([ 0.2041, -0.1729, 0.1299, ..., 0.2246, -0.1216, 0.1445],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.mlp.c_fc.SCB => tensor([0.1680, 0.1328, 0.2500, ..., 0.1206, 0.1167, 0.1128], device='cuda:0')\n", "transformer.h.1.mlp.c_fc.weight_format => col_turing\n", "transformer.h.1.mlp.c_fc2.weight => tensor([[ -13, -66, -31, ..., 47, 11, 24],\n", " [ -15, -7, 15, ..., -13, 10, 6],\n", " [ 7, -54, 14, ..., -32, -6, -57],\n", " ...,\n", " [ -16, -40, -52, ..., 51, -24, 12],\n", " [ 16, 17, -14, ..., -14, 6, -10],\n", " [ -76, -2, -109, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.1.mlp.c_fc2.bias => tensor([-0.9844, -0.8047, -0.4961, ..., -0.6406, -0.7461, -0.4609],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.mlp.c_fc2.SCB => tensor([0.1235, 0.1172, 0.1631, ..., 0.1099, 0.1177, 0.1055], device='cuda:0')\n", "transformer.h.1.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.1.mlp.c_proj.weight => tensor([[ 7, -15, 3, ..., 25, -45, 40],\n", " [ 5, 45, 28, ..., -14, 12, -15],\n", " [ 16, 4, -31, ..., -17, -33, -15],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 7, -21, 40, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.1.mlp.c_proj.bias => tensor([ 0.3750, 0.1338, 0.1177, ..., -0.1943, 0.3184, -0.3828],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.1.mlp.c_proj.SCB => tensor([0.1475, 0.1406, 0.1328, ..., 0.1318, 0.1279, 0.1504], device='cuda:0')\n", "transformer.h.1.mlp.c_proj.weight_format => col_turing\n", "transformer.h.2.ln_1.weight => tensor([0.9844, 1.0859, 1.0703, ..., 0.9453, 0.9258, 1.0156], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.2.ln_1.bias => tensor([-0.0258, 0.1226, 0.0175, ..., -0.0105, -0.0170, -0.0311],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.attn.c_attn.weight => tensor([[ 41, -32, -41, ..., -43, -37, -50],\n", " [ -3, 37, -51, ..., 14, -35, -33],\n", " [ 33, -19, 6, ..., -52, 49, 6],\n", " ...,\n", " [-14, -67, -14, ..., -25, 60, 16],\n", " [-44, -14, 50, ..., 33, -5, -64],\n", " [-22, -4, 0, ..., 50, -33, -9]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.2.attn.c_attn.bias => tensor([-0.9570, 0.2344, 0.7422, ..., 0.0215, -0.4238, -1.2500],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.attn.c_attn.SCB => tensor([0.1318, 0.1396, 0.1777, ..., 0.0535, 0.0771, 0.0693], device='cuda:0')\n", "transformer.h.2.attn.c_attn.weight_format => col_turing\n", "transformer.h.2.attn.c_proj.weight => tensor([[ 37, -32, -41, ..., -33, 
14, 26],\n", " [ 21, -14, -4, ..., -38, 4, 47],\n", " [ 58, 72, 24, ..., -7, -47, -67],\n", " ...,\n", " [ 7, 11, 44, ..., 21, 37, -47],\n", " [ 4, 6, 4, ..., -11, -35, -59],\n", " [ 12, 37, 12, ..., 8, 7, -31]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.2.attn.c_proj.bias => tensor([ 0.1602, 0.1592, -0.1699, ..., -0.0430, 0.1406, -0.5312],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.attn.c_proj.SCB => tensor([0.0811, 0.0776, 0.0967, ..., 0.0889, 0.0825, 0.0898], device='cuda:0')\n", "transformer.h.2.attn.c_proj.weight_format => col_turing\n", "transformer.h.2.ln_2.weight => tensor([0.4805, 0.6445, 0.6719, ..., 0.5781, 0.6445, 0.6992], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.2.ln_2.bias => tensor([-0.0011, 0.0074, 0.0491, ..., 0.0035, -0.0540, 0.1182],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.mlp.c_fc.weight => tensor([[ 7, -15, 36, ..., 51, -84, -22],\n", " [ 5, -9, -56, ..., 23, 6, 2],\n", " [ 23, 46, 77, ..., 7, 11, 31],\n", " ...,\n", " [ -9, 4, 1, ..., -25, 9, 17],\n", " [ 44, -10, -4, ..., 1, 37, 9],\n", " [-49, 35, 11, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.2.mlp.c_fc.bias => tensor([-0.0601, -0.0957, 0.2500, ..., 0.0879, -0.0811, 0.0053],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.mlp.c_fc.SCB => tensor([0.1162, 0.1187, 0.1104, ..., 0.1216, 0.1235, 0.1133], device='cuda:0')\n", "transformer.h.2.mlp.c_fc.weight_format => col_turing\n", "transformer.h.2.mlp.c_fc2.weight => tensor([[ 8, 28, 23, ..., -5, 59, -2],\n", " [ 2, 22, 33, ..., -1, -3, 3],\n", " [-27, -70, -45, ..., 28, 39, -10],\n", " ...,\n", " [ -2, -59, -3, ..., -59, -45, -6],\n", " [-14, -29, 35, ..., 4, 57, 5],\n", " [ 5, 32, -1, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.2.mlp.c_fc2.bias => tensor([-1.0625, -0.9023, -0.6836, ..., -1.0234, -1.0156, -1.1094],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.mlp.c_fc2.SCB => tensor([0.1226, 0.1206, 0.1021, ..., 0.1094, 0.1138, 0.1279], device='cuda:0')\n", "transformer.h.2.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.2.mlp.c_proj.weight => tensor([[ -1, 18, 22, ..., 0, 20, 61],\n", " [-11, -18, 24, ..., 10, 11, -70],\n", " [ 21, 51, -18, ..., 46, 36, -29],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-64, -21, 17, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.2.mlp.c_proj.bias => tensor([-0.2051, 0.8125, -0.2949, ..., 0.0204, -0.0291, 0.2002],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.2.mlp.c_proj.SCB => tensor([0.1602, 0.1523, 0.1318, ..., 0.1475, 0.1357, 0.1367], device='cuda:0')\n", "transformer.h.2.mlp.c_proj.weight_format => col_turing\n", "transformer.h.3.ln_1.weight => tensor([1.1484, 1.2812, 1.3047, ..., 1.1016, 1.0781, 1.1406], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.3.ln_1.bias => tensor([ 0.0884, -0.2637, 0.1914, ..., -0.0078, -0.0703, 0.0542],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.attn.c_attn.weight => tensor([[ 7, 9, 25, ..., 62, 20, 42],\n", " [ -6, -1, 41, ..., 29, 32, 22],\n", " [ -6, -75, 33, ..., -11, 127, 33],\n", " ...,\n", " [ 20, 3, -22, ..., 34, 5, -45],\n", " [ -7, 20, -15, ..., -20, -57, -7],\n", " [ 43, 6, -12, ..., -46, -5, -38]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.3.attn.c_attn.bias => tensor([-0.2754, 0.3555, -0.2090, ..., 0.0835, 0.0564, -0.0864],\n", " device='cuda:0', 
dtype=torch.bfloat16)\n", "transformer.h.3.attn.c_attn.SCB => tensor([0.1660, 0.1318, 0.1289, ..., 0.1030, 0.0967, 0.1069], device='cuda:0')\n", "transformer.h.3.attn.c_attn.weight_format => col_turing\n", "transformer.h.3.attn.c_proj.weight => tensor([[ 8, 16, 33, ..., -18, -5, -30],\n", " [ 34, 66, -33, ..., -20, 44, 21],\n", " [ 25, -29, 53, ..., -6, 65, 17],\n", " ...,\n", " [ -7, -2, -10, ..., -36, 10, -5],\n", " [-29, 39, 5, ..., 27, 37, 45],\n", " [-35, 31, -15, ..., 26, 14, -25]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.3.attn.c_proj.bias => tensor([ 0.1338, 0.2637, 0.4180, ..., -0.0247, 0.0996, -0.3008],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.attn.c_proj.SCB => tensor([0.1191, 0.0879, 0.0952, ..., 0.0786, 0.0952, 0.1001], device='cuda:0')\n", "transformer.h.3.attn.c_proj.weight_format => col_turing\n", "transformer.h.3.ln_2.weight => tensor([0.4980, 0.7227, 0.7617, ..., 0.6445, 0.6953, 0.8242], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.3.ln_2.bias => tensor([-0.0168, -0.1777, 0.0124, ..., -0.0107, -0.0466, 0.1289],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.mlp.c_fc.weight => tensor([[-22, 9, -1, ..., 38, -35, -7],\n", " [ -1, 6, -7, ..., -39, 16, -79],\n", " [-29, -4, 11, ..., -68, 25, -1],\n", " ...,\n", " [-35, -47, 8, ..., -13, -39, -14],\n", " [-28, -4, -20, ..., 53, -19, -26],\n", " [ 10, -43, -23, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.3.mlp.c_fc.bias => tensor([-0.0199, 0.0125, 0.0427, ..., -0.0620, 0.0303, -0.0175],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.mlp.c_fc.SCB => tensor([0.1230, 0.1118, 0.1069, ..., 0.1396, 0.1396, 0.1191], device='cuda:0')\n", "transformer.h.3.mlp.c_fc.weight_format => col_turing\n", "transformer.h.3.mlp.c_fc2.weight => tensor([[ 45, 49, -12, ..., 41, 37, -33],\n", " [ 3, 6, -93, ..., -37, 20, -11],\n", " [ 24, -30, -35, ..., -12, -6, 12],\n", " ...,\n", " [ 89, -20, -40, ..., -3, -12, -18],\n", " [-37, -15, 3, ..., 45, 76, -48],\n", " [-18, 74, -7, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.3.mlp.c_fc2.bias => tensor([-1.5625, -0.8984, -1.1484, ..., -1.5000, -1.2188, -1.3203],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.mlp.c_fc2.SCB => tensor([0.1230, 0.1089, 0.1133, ..., 0.1138, 0.1074, 0.1201], device='cuda:0')\n", "transformer.h.3.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.3.mlp.c_proj.weight => tensor([[ 51, -11, 24, ..., 4, 10, 26],\n", " [ 10, -25, -3, ..., 23, -21, -5],\n", " [-35, 0, 3, ..., -9, 31, 17],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ -2, 40, -21, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.3.mlp.c_proj.bias => tensor([-0.4141, 0.7773, 0.6016, ..., 0.0864, -0.4141, -0.0967],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.3.mlp.c_proj.SCB => tensor([0.1338, 0.1133, 0.1187, ..., 0.1406, 0.1377, 0.1426], device='cuda:0')\n", "transformer.h.3.mlp.c_proj.weight_format => col_turing\n", "transformer.h.4.ln_1.weight => tensor([1.3516, 1.4766, 1.5000, ..., 1.3438, 1.2812, 1.3828], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.4.ln_1.bias => tensor([ 0.0056, -0.6016, -0.2295, ..., -0.0483, 0.0371, 0.1641],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.attn.c_attn.weight => tensor([[ -1, -13, -19, ..., 11, 13, -8],\n", " [-42, 23, 48, ..., -6, -60, -11],\n", " [-15, -29, 15, ..., -27, 4, 
12],\n", " ...,\n", " [-38, -62, 3, ..., 35, 19, 32],\n", " [-14, 58, 56, ..., -63, 20, 28],\n", " [ 34, 31, 4, ..., 73, -7, 15]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.4.attn.c_attn.bias => tensor([-0.6445, 1.0391, 1.3516, ..., -0.1641, -0.0425, 0.1270],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.attn.c_attn.SCB => tensor([0.1504, 0.1396, 0.1494, ..., 0.1089, 0.0967, 0.1245], device='cuda:0')\n", "transformer.h.4.attn.c_attn.weight_format => col_turing\n", "transformer.h.4.attn.c_proj.weight => tensor([[ 16, -17, -3, ..., 77, -15, -28],\n", " [ 0, -8, -24, ..., -9, 8, 4],\n", " [ 27, -5, -7, ..., 26, -20, 6],\n", " ...,\n", " [ 43, -3, 53, ..., 28, 13, -59],\n", " [ 22, -37, -6, ..., 91, -30, -62],\n", " [ 75, 48, -33, ..., -13, 16, -62]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.4.attn.c_proj.bias => tensor([ 0.1543, -0.2314, 0.0996, ..., -0.0688, 0.0140, 0.3613],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.attn.c_proj.SCB => tensor([0.1177, 0.0918, 0.0933, ..., 0.0835, 0.0986, 0.1074], device='cuda:0')\n", "transformer.h.4.attn.c_proj.weight_format => col_turing\n", "transformer.h.4.ln_2.weight => tensor([0.4961, 0.7656, 0.7812, ..., 0.6680, 0.7266, 0.8125], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.4.ln_2.bias => tensor([-0.0168, -0.2344, -0.1177, ..., 0.0061, 0.0148, 0.0334],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.mlp.c_fc.weight => tensor([[-33, -36, 18, ..., 60, 33, 41],\n", " [ 11, 44, -26, ..., 27, -63, 55],\n", " [ 60, 19, 28, ..., -7, -25, 55],\n", " ...,\n", " [-22, -60, -9, ..., -53, 5, -21],\n", " [ 0, -33, 32, ..., -44, 37, 16],\n", " [ 21, -16, 9, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.4.mlp.c_fc.bias => tensor([-0.1846, -0.1758, 0.1309, ..., 0.1758, 0.0801, -0.0393],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.mlp.c_fc.SCB => tensor([0.1211, 0.1104, 0.1216, ..., 0.1738, 0.1133, 0.1167], device='cuda:0')\n", "transformer.h.4.mlp.c_fc.weight_format => col_turing\n", "transformer.h.4.mlp.c_fc2.weight => tensor([[ -3, -2, -37, ..., 46, -22, -17],\n", " [ 24, -10, -69, ..., -44, -21, -9],\n", " [-20, 52, 6, ..., -41, -76, 29],\n", " ...,\n", " [ 37, -9, -5, ..., -5, 17, -23],\n", " [ 34, -28, -2, ..., 34, 13, -15],\n", " [-31, -7, -30, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.4.mlp.c_fc2.bias => tensor([-0.8750, -0.9336, -0.9102, ..., 0.0211, -1.1250, -1.1250],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.mlp.c_fc2.SCB => tensor([0.1279, 0.1138, 0.1045, ..., 0.1357, 0.1279, 0.1094], device='cuda:0')\n", "transformer.h.4.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.4.mlp.c_proj.weight => tensor([[ -4, -20, 5, ..., -29, -13, 31],\n", " [-10, -2, -39, ..., 31, -37, 28],\n", " [ 10, -67, -3, ..., 37, -27, 3],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ -8, 55, 23, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.4.mlp.c_proj.bias => tensor([-0.0972, 0.4922, -0.0957, ..., 0.0039, 0.0359, -0.2139],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.4.mlp.c_proj.SCB => tensor([0.1533, 0.1299, 0.1260, ..., 0.1348, 0.1279, 0.1187], device='cuda:0')\n", "transformer.h.4.mlp.c_proj.weight_format => col_turing\n", "transformer.h.5.ln_1.weight => tensor([1.3906, 1.6406, 1.6562, ..., 1.4297, 1.4219, 1.5156], device='cuda:0',\n", " dtype=torch.bfloat16)\n", 
"transformer.h.5.ln_1.bias => tensor([-0.0089, -0.6406, -0.2188, ..., -0.0320, 0.0225, 0.1406],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.attn.c_attn.weight => tensor([[-58, 102, -14, ..., 48, 21, -3],\n", " [-14, 9, 1, ..., 4, 24, -21],\n", " [-43, -19, 27, ..., -7, 24, -12],\n", " ...,\n", " [ -6, -17, -14, ..., -14, 15, -12],\n", " [ 14, 8, 6, ..., 44, 21, 9],\n", " [ -5, -18, 59, ..., -73, -25, -35]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.5.attn.c_attn.bias => tensor([-0.9922, -0.4434, -0.2812, ..., -0.1113, 0.0322, -0.0303],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.attn.c_attn.SCB => tensor([0.1709, 0.1855, 0.1670, ..., 0.1182, 0.1250, 0.1250], device='cuda:0')\n", "transformer.h.5.attn.c_attn.weight_format => col_turing\n", "transformer.h.5.attn.c_proj.weight => tensor([[-20, 4, 7, ..., -11, -50, 27],\n", " [ 22, -24, -15, ..., -23, 20, 1],\n", " [ -5, -1, 32, ..., 30, -11, -32],\n", " ...,\n", " [-18, -16, 0, ..., 51, -38, 15],\n", " [ 35, -23, -50, ..., 44, -30, 46],\n", " [ 2, -17, -34, ..., 44, 58, -26]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.5.attn.c_proj.bias => tensor([-0.0786, 0.2441, 0.1445, ..., -0.0796, 0.0288, -0.1514],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.attn.c_proj.SCB => tensor([0.1089, 0.1021, 0.1108, ..., 0.0845, 0.1147, 0.1006], device='cuda:0')\n", "transformer.h.5.attn.c_proj.weight_format => col_turing\n", "transformer.h.5.ln_2.weight => tensor([0.5195, 0.8477, 0.8867, ..., 0.7070, 0.7852, 0.9453], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.5.ln_2.bias => tensor([ 0.0144, -0.3477, -0.1094, ..., -0.0109, 0.0287, 0.1045],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.mlp.c_fc.weight => tensor([[ 0, -73, 85, ..., 6, -25, 12],\n", " [ 16, 28, -40, ..., -16, -48, -47],\n", " [-20, -2, 11, ..., 23, -13, -4],\n", " ...,\n", " [ 61, 22, 53, ..., 3, -48, 0],\n", " [ 69, 11, 45, ..., 61, -29, -2],\n", " [ 24, -55, -39, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.5.mlp.c_fc.bias => tensor([-0.1680, -0.1055, 0.2520, ..., 0.4844, 0.0181, 0.0219],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.mlp.c_fc.SCB => tensor([0.1260, 0.1367, 0.1143, ..., 0.1455, 0.1240, 0.1260], device='cuda:0')\n", "transformer.h.5.mlp.c_fc.weight_format => col_turing\n", "transformer.h.5.mlp.c_fc2.weight => tensor([[ 6, 21, -16, ..., 5, -53, 23],\n", " [ 8, -28, 9, ..., 36, 10, 22],\n", " [ 4, -33, -13, ..., 4, 30, -19],\n", " ...,\n", " [ 0, -26, 32, ..., -19, -29, -22],\n", " [ 47, -48, -27, ..., -13, 37, -18],\n", " [-15, -7, 4, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.5.mlp.c_fc2.bias => tensor([-1.0469, -1.1875, -1.8359, ..., 0.3242, -1.4141, -1.2656],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.5.mlp.c_fc2.SCB => tensor([0.1260, 0.1191, 0.1216, ..., 0.0654, 0.1426, 0.1289], device='cuda:0')\n", "transformer.h.5.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.5.mlp.c_proj.weight => tensor([[ 17, -48, 12, ..., 36, 40, -67],\n", " [ 52, 50, -21, ..., 23, -59, 59],\n", " [-34, 16, 17, ..., -42, -29, 48],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ -6, -47, 12, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.5.mlp.c_proj.bias => tensor([-0.6055, 0.5000, 0.2656, ..., -0.0659, 0.0188, -0.1055],\n", " device='cuda:0', dtype=torch.bfloat16)\n", 
"transformer.h.5.mlp.c_proj.SCB => tensor([0.1309, 0.1240, 0.1484, ..., 0.1514, 0.1670, 0.1211], device='cuda:0')\n", "transformer.h.5.mlp.c_proj.weight_format => col_turing\n", "transformer.h.6.ln_1.weight => tensor([1.5391, 1.7812, 1.8906, ..., 1.5000, 1.4688, 1.6484], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.6.ln_1.bias => tensor([ 0.0815, -0.7734, -0.3301, ..., -0.0099, 0.0498, 0.1680],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.attn.c_attn.weight => tensor([[ 19, 34, 6, ..., 72, -26, -84],\n", " [-29, 37, -37, ..., -14, -58, 16],\n", " [-21, 38, -1, ..., -79, -19, -28],\n", " ...,\n", " [-39, -4, 7, ..., 29, 18, -64],\n", " [ 20, -7, -10, ..., -3, -55, 42],\n", " [ 47, -9, -26, ..., 78, 6, -27]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.6.attn.c_attn.bias => tensor([ 0.2734, 0.0593, -0.1963, ..., 0.0498, 0.1777, 0.0608],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.attn.c_attn.SCB => tensor([0.1289, 0.1309, 0.1152, ..., 0.0664, 0.1074, 0.0708], device='cuda:0')\n", "transformer.h.6.attn.c_attn.weight_format => col_turing\n", "transformer.h.6.attn.c_proj.weight => tensor([[ 4, 20, -23, ..., -9, -10, 13],\n", " [ 13, -21, -14, ..., 22, 29, 19],\n", " [ 28, 23, 19, ..., -24, 17, 8],\n", " ...,\n", " [-13, -23, 6, ..., -24, 12, -12],\n", " [-57, -2, -29, ..., -25, 50, -8],\n", " [ -5, -9, -9, ..., 7, 33, 19]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.6.attn.c_proj.bias => tensor([ 0.2598, 0.1226, -0.1816, ..., 0.0850, 0.0391, 0.1982],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.attn.c_proj.SCB => tensor([0.1250, 0.0962, 0.0962, ..., 0.0898, 0.0918, 0.1050], device='cuda:0')\n", "transformer.h.6.attn.c_proj.weight_format => col_turing\n", "transformer.h.6.ln_2.weight => tensor([0.5156, 0.8828, 0.8945, ..., 0.7578, 0.8320, 1.0078], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.6.ln_2.bias => tensor([ 0.0312, -0.3965, -0.0923, ..., -0.0024, 0.0253, 0.0718],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.mlp.c_fc.weight => tensor([[-41, 10, 20, ..., 16, 34, 19],\n", " [ 14, 46, -3, ..., -29, 1, -3],\n", " [ 6, 18, 28, ..., 10, 62, -5],\n", " ...,\n", " [ 11, 1, -17, ..., 18, -53, -16],\n", " [ -2, -11, -13, ..., -21, -4, 24],\n", " [-41, 0, -36, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.6.mlp.c_fc.bias => tensor([-0.0194, -0.2656, 0.1445, ..., -0.0496, -0.0115, 0.0579],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.mlp.c_fc.SCB => tensor([0.1289, 0.1387, 0.1611, ..., 0.1270, 0.1309, 0.1318], device='cuda:0')\n", "transformer.h.6.mlp.c_fc.weight_format => col_turing\n", "transformer.h.6.mlp.c_fc2.weight => tensor([[ 20, 44, 13, ..., 25, -6, 55],\n", " [-17, 7, 21, ..., 30, -28, -26],\n", " [-39, 67, 3, ..., 55, 5, -25],\n", " ...,\n", " [ -1, -3, -15, ..., -27, -1, 21],\n", " [ 13, 23, 42, ..., -48, -14, 11],\n", " [ 18, -9, -20, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.6.mlp.c_fc2.bias => tensor([-1.1719, -0.9688, -0.9688, ..., -1.0859, -0.9141, -1.4141],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.mlp.c_fc2.SCB => tensor([0.1133, 0.1206, 0.1177, ..., 0.0972, 0.1030, 0.1230], device='cuda:0')\n", "transformer.h.6.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.6.mlp.c_proj.weight => tensor([[ 43, -58, 10, ..., 9, -35, 3],\n", " [ 36, -22, -33, ..., -54, 5, 45],\n", " [ 58, -4, -7, ..., -44, 31, -43],\n", " 
...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 15, 27, -15, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.6.mlp.c_proj.bias => tensor([-0.4629, 0.4902, 0.2930, ..., -0.0162, 0.0447, 0.0059],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.6.mlp.c_proj.SCB => tensor([0.1514, 0.1543, 0.1406, ..., 0.1348, 0.1338, 0.1387], device='cuda:0')\n", "transformer.h.6.mlp.c_proj.weight_format => col_turing\n", "transformer.h.7.ln_1.weight => tensor([1.6094, 1.8906, 2.0625, ..., 1.6719, 1.6719, 1.7656], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.7.ln_1.bias => tensor([ 0.0342, -0.7891, -0.3242, ..., -0.0060, 0.0459, 0.1631],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.attn.c_attn.weight => tensor([[ 18, 18, -57, ..., -33, -5, -22],\n", " [ 8, -57, -4, ..., 33, 14, 40],\n", " [ 12, 43, -27, ..., -25, -8, 13],\n", " ...,\n", " [ 65, 42, -19, ..., 38, -12, -55],\n", " [ 0, -38, -47, ..., 24, 11, 5],\n", " [ 24, 63, 9, ..., 21, 4, 40]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.7.attn.c_attn.bias => tensor([ 0.2070, 0.5586, -1.0469, ..., 0.3203, 0.1191, 0.0442],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.attn.c_attn.SCB => tensor([0.1367, 0.1201, 0.1396, ..., 0.1201, 0.1416, 0.1289], device='cuda:0')\n", "transformer.h.7.attn.c_attn.weight_format => col_turing\n", "transformer.h.7.attn.c_proj.weight => tensor([[ 28, 1, -22, ..., 18, 0, -13],\n", " [ -2, -1, -12, ..., -14, -20, -37],\n", " [ 0, -1, -29, ..., 6, -10, -9],\n", " ...,\n", " [ 14, -22, 28, ..., 4, -44, 34],\n", " [ -14, -109, 1, ..., 74, -17, 23],\n", " [ 22, -51, 5, ..., -7, 11, -17]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.7.attn.c_proj.bias => tensor([-0.1699, 0.4590, 0.2256, ..., 0.1187, -0.0059, -0.0649],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.attn.c_proj.SCB => tensor([0.1221, 0.1021, 0.0928, ..., 0.0908, 0.0908, 0.1025], device='cuda:0')\n", "transformer.h.7.attn.c_proj.weight_format => col_turing\n", "transformer.h.7.ln_2.weight => tensor([0.5117, 0.8906, 0.9062, ..., 0.7422, 0.8203, 0.9961], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.7.ln_2.bias => tensor([ 0.0258, -0.3945, -0.1436, ..., -0.0069, 0.0115, 0.1074],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.mlp.c_fc.weight => tensor([[ 12, -30, 16, ..., -26, 13, -9],\n", " [ 43, 60, -30, ..., -26, 40, -32],\n", " [ 15, -9, 19, ..., -43, 16, 0],\n", " ...,\n", " [ 1, -75, -19, ..., -14, 18, -16],\n", " [ 12, -47, -20, ..., -58, -5, 44],\n", " [-17, -69, 15, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.7.mlp.c_fc.bias => tensor([-0.0525, -0.1953, 0.0981, ..., -0.0510, -0.0967, 0.2969],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.mlp.c_fc.SCB => tensor([0.1216, 0.1279, 0.1631, ..., 0.1108, 0.1338, 0.1680], device='cuda:0')\n", "transformer.h.7.mlp.c_fc.weight_format => col_turing\n", "transformer.h.7.mlp.c_fc2.weight => tensor([[-23, 38, -7, ..., -35, 52, -52],\n", " [ 28, -53, -6, ..., 29, 82, -4],\n", " [-12, -47, 69, ..., -34, 11, 13],\n", " ...,\n", " [ -4, -81, 11, ..., -22, 12, -19],\n", " [ -8, -49, -47, ..., 34, 47, 22],\n", " [-49, 55, -16, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.7.mlp.c_fc2.bias => tensor([-1.1328, -1.1875, -1.0312, ..., 0.7148, -1.0312, -1.4609],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.mlp.c_fc2.SCB => 
tensor([0.1250, 0.1147, 0.1089, ..., 0.1025, 0.1177, 0.1206], device='cuda:0')\n", "transformer.h.7.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.7.mlp.c_proj.weight => tensor([[-22, 16, 4, ..., 29, 48, 18],\n", " [-44, 32, -8, ..., 35, 30, -37],\n", " [-16, -14, 29, ..., 26, 30, -32],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 54, 51, -34, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.7.mlp.c_proj.bias => tensor([-1.2422, 0.5234, 0.3496, ..., 0.0991, 0.1416, -0.0052],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.7.mlp.c_proj.SCB => tensor([0.1553, 0.1338, 0.1309, ..., 0.1318, 0.1738, 0.1406], device='cuda:0')\n", "transformer.h.7.mlp.c_proj.weight_format => col_turing\n", "transformer.h.8.ln_1.weight => tensor([1.4297, 1.7188, 1.8281, ..., 1.5000, 1.5078, 1.5938], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.8.ln_1.bias => tensor([ 0.1162, -0.7422, -0.2832, ..., -0.0579, 0.0067, 0.1494],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.attn.c_attn.weight => tensor([[-64, 21, 30, ..., 27, 10, 32],\n", " [ 3, 32, -18, ..., -14, -14, 4],\n", " [ 27, -15, -7, ..., 14, 30, 6],\n", " ...,\n", " [-38, 14, 44, ..., 21, 26, -9],\n", " [-19, -10, -17, ..., 32, 50, 47],\n", " [ 10, -52, 31, ..., 14, -13, -13]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.8.attn.c_attn.bias => tensor([-0.1206, -0.3984, 0.0898, ..., -0.0593, 0.0515, 0.0952],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.attn.c_attn.SCB => tensor([0.1338, 0.1494, 0.1660, ..., 0.1050, 0.1079, 0.1279], device='cuda:0')\n", "transformer.h.8.attn.c_attn.weight_format => col_turing\n", "transformer.h.8.attn.c_proj.weight => tensor([[ 9, -1, 28, ..., 13, 19, 10],\n", " [ 39, 4, -15, ..., -29, -5, -38],\n", " [-18, 19, 7, ..., -8, 2, 4],\n", " ...,\n", " [-30, -44, 0, ..., -64, 30, -3],\n", " [ 7, -1, -5, ..., -29, -38, -47],\n", " [-22, -5, 0, ..., 34, 10, 8]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.8.attn.c_proj.bias => tensor([ 0.2734, 0.7383, 0.1123, ..., -0.1709, 0.0674, -0.1973],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.attn.c_proj.SCB => tensor([0.1235, 0.0972, 0.0972, ..., 0.0864, 0.0928, 0.1147], device='cuda:0')\n", "transformer.h.8.attn.c_proj.weight_format => col_turing\n", "transformer.h.8.ln_2.weight => tensor([0.5195, 0.8867, 0.9102, ..., 0.7695, 0.8125, 1.0312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.8.ln_2.bias => tensor([ 0.0099, -0.4277, -0.1226, ..., -0.0096, -0.0047, 0.1299],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.mlp.c_fc.weight => tensor([[-37, 3, 27, ..., 13, 21, 17],\n", " [ 16, -2, 35, ..., -33, -14, -47],\n", " [-14, 19, 15, ..., -30, 2, 7],\n", " ...,\n", " [-21, 45, 8, ..., -24, 11, -28],\n", " [ 15, 24, -11, ..., -38, -37, 4],\n", " [ 18, -9, -5, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.8.mlp.c_fc.bias => tensor([-0.0427, 0.2891, 0.0620, ..., 2.3281, -0.7383, 0.1328],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.mlp.c_fc.SCB => tensor([0.1680, 0.1562, 0.1260, ..., 0.0879, 0.1309, 0.1348], device='cuda:0')\n", "transformer.h.8.mlp.c_fc.weight_format => col_turing\n", "transformer.h.8.mlp.c_fc2.weight => tensor([[-17, -18, 9, ..., -24, 18, 53],\n", " [ -2, 19, -9, ..., -41, 17, 68],\n", " [-40, -9, -4, ..., 74, 44, 35],\n", " ...,\n", " [ 8, -40, 51, ..., -28, -28, -12],\n", " [ 27, 22, -56, 
..., 41, 20, 8],\n", " [ 70, -25, -9, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.8.mlp.c_fc2.bias => tensor([-1.1172, -1.3438, -1.2891, ..., -0.4941, -0.3652, -1.2656],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.mlp.c_fc2.SCB => tensor([0.1064, 0.1128, 0.1177, ..., 0.1143, 0.1099, 0.1245], device='cuda:0')\n", "transformer.h.8.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.8.mlp.c_proj.weight => tensor([[ 24, -17, 5, ..., -8, 16, 21],\n", " [ 13, -17, -48, ..., -27, -9, -16],\n", " [ -3, 7, 20, ..., 13, -23, -10],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 1, 31, 13, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.8.mlp.c_proj.bias => tensor([-0.9258, 0.5781, 0.2061, ..., -0.0283, -0.0243, 0.3496],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.8.mlp.c_proj.SCB => tensor([0.1152, 0.1553, 0.1426, ..., 0.1533, 0.1318, 0.1328], device='cuda:0')\n", "transformer.h.8.mlp.c_proj.weight_format => col_turing\n", "transformer.h.9.ln_1.weight => tensor([1.6641, 2.0781, 2.2344, ..., 1.7656, 1.7891, 1.8516], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.9.ln_1.bias => tensor([ 0.0820, -0.8203, -0.3457, ..., -0.0569, 0.0393, 0.1699],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.attn.c_attn.weight => tensor([[-37, 8, 50, ..., -25, 24, 15],\n", " [-11, 24, -50, ..., 11, -61, -14],\n", " [-27, -37, 16, ..., 13, -18, 52],\n", " ...,\n", " [ 1, -26, -34, ..., -8, 26, 22],\n", " [-23, 13, -41, ..., -64, -33, -14],\n", " [ 34, 27, 21, ..., -41, -60, -6]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.9.attn.c_attn.bias => tensor([-0.4395, -0.4844, -0.8438, ..., -0.0304, 0.0082, -0.0062],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.attn.c_attn.SCB => tensor([0.1191, 0.1221, 0.1128, ..., 0.1074, 0.1099, 0.0957], device='cuda:0')\n", "transformer.h.9.attn.c_attn.weight_format => col_turing\n", "transformer.h.9.attn.c_proj.weight => tensor([[ 18, -6, 21, ..., -13, -45, -3],\n", " [ 21, -46, -26, ..., -22, -10, 3],\n", " [ 2, 22, 50, ..., -24, 29, 10],\n", " ...,\n", " [-37, 35, 33, ..., 20, 13, -23],\n", " [ 3, -8, -2, ..., 17, -54, -11],\n", " [ -5, -26, 49, ..., -31, -9, -9]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.9.attn.c_proj.bias => tensor([ 0.0042, 0.6445, 0.0703, ..., -0.0354, -0.1455, -0.2402],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.attn.c_proj.SCB => tensor([0.1455, 0.1035, 0.1055, ..., 0.1099, 0.0918, 0.1201], device='cuda:0')\n", "transformer.h.9.attn.c_proj.weight_format => col_turing\n", "transformer.h.9.ln_2.weight => tensor([0.5547, 0.9375, 0.9375, ..., 0.8008, 0.8359, 1.0625], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.9.ln_2.bias => tensor([ 0.0258, -0.3887, -0.1455, ..., -0.0293, 0.0444, 0.1406],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.mlp.c_fc.weight => tensor([[-15, 21, -93, ..., -22, -15, 28],\n", " [ 41, 2, 58, ..., -24, 35, -19],\n", " [ 45, 27, -24, ..., -87, 25, 61],\n", " ...,\n", " [ -9, -35, 19, ..., -18, -1, -56],\n", " [-27, -21, -22, ..., -54, -40, -11],\n", " [ 39, 31, -2, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.9.mlp.c_fc.bias => tensor([ 0.0273, 0.2559, 0.0071, ..., 0.0188, -0.0615, 0.1011],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.mlp.c_fc.SCB => tensor([0.1426, 0.1611, 0.1582, ..., 0.1157, 0.1187, 
0.1328], device='cuda:0')\n", "transformer.h.9.mlp.c_fc.weight_format => col_turing\n", "transformer.h.9.mlp.c_fc2.weight => tensor([[-12, -39, 3, ..., -6, 26, 27],\n", " [-15, 9, -19, ..., 17, 8, 31],\n", " [-31, 25, 18, ..., -18, -30, -3],\n", " ...,\n", " [-12, -21, 115, ..., 14, 3, -21],\n", " [-42, -41, -41, ..., 55, -18, -30],\n", " [ 40, 47, -20, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.9.mlp.c_fc2.bias => tensor([-1.1719, -1.3203, -1.2109, ..., -0.2539, -0.8398, -1.6875],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.mlp.c_fc2.SCB => tensor([0.1235, 0.1279, 0.1436, ..., 0.1128, 0.1377, 0.1260], device='cuda:0')\n", "transformer.h.9.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.9.mlp.c_proj.weight => tensor([[-39, -36, -11, ..., -21, 37, 42],\n", " [ 15, 30, 15, ..., 2, -3, -46],\n", " [ 3, 43, -17, ..., -35, 20, 6],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 13, -19, -10, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.9.mlp.c_proj.bias => tensor([-0.7422, 0.7305, 0.8750, ..., -0.0864, 0.0791, 0.0178],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.9.mlp.c_proj.SCB => tensor([0.1338, 0.1475, 0.1533, ..., 0.1445, 0.1348, 0.1445], device='cuda:0')\n", "transformer.h.9.mlp.c_proj.weight_format => col_turing\n", "transformer.h.10.ln_1.weight => tensor([1.6250, 2.0938, 2.3594, ..., 1.7812, 1.7422, 1.9453], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.10.ln_1.bias => tensor([ 0.1182, -0.8086, -0.3477, ..., -0.0044, 0.0356, 0.1787],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.attn.c_attn.weight => tensor([[-29, -55, -21, ..., 44, -28, 70],\n", " [ -4, 21, 0, ..., 8, 64, -8],\n", " [ -1, -6, -43, ..., 18, 5, 25],\n", " ...,\n", " [ 70, 43, 26, ..., 33, 0, 9],\n", " [-14, -45, 0, ..., -42, -60, 68],\n", " [-43, 0, -1, ..., -13, 25, 38]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.10.attn.c_attn.bias => tensor([-0.0283, 3.3125, -1.5938, ..., -0.0084, 0.0566, -0.3145],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.attn.c_attn.SCB => tensor([0.1328, 0.1240, 0.1436, ..., 0.1475, 0.1367, 0.1187], device='cuda:0')\n", "transformer.h.10.attn.c_attn.weight_format => col_turing\n", "transformer.h.10.attn.c_proj.weight => tensor([[ 17, -10, 19, ..., -16, -24, -31],\n", " [-47, -32, -24, ..., 17, -12, 1],\n", " [ -6, -31, -17, ..., 1, -8, -23],\n", " ...,\n", " [ 4, -41, 15, ..., 36, 17, 28],\n", " [ 11, 25, 17, ..., -9, 37, -5],\n", " [ 23, -15, 14, ..., -15, 8, 35]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.10.attn.c_proj.bias => tensor([ 0.0557, 0.4082, 0.2402, ..., 0.0325, 0.0483, -0.0874],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.attn.c_proj.SCB => tensor([0.1177, 0.1177, 0.1079, ..., 0.1040, 0.1001, 0.1172], device='cuda:0')\n", "transformer.h.10.attn.c_proj.weight_format => col_turing\n", "transformer.h.10.ln_2.weight => tensor([0.5664, 0.9688, 0.9297, ..., 0.8008, 0.8555, 1.0547], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.10.ln_2.bias => tensor([ 0.0474, -0.3730, -0.1216, ..., 0.0016, 0.0232, 0.1069],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.mlp.c_fc.weight => tensor([[ 6, -48, -23, ..., 10, -22, 26],\n", " [ 38, 41, 38, ..., 28, -9, 28],\n", " [ 11, -49, -32, ..., 21, -36, -38],\n", " ...,\n", " [ 2, 51, 48, ..., -8, 27, 16],\n", " [-26, -64, 8, ..., -37, -6, 3],\n", " [ 25, 
13, 88, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.10.mlp.c_fc.bias => tensor([ 0.1069, 0.2080, -0.1348, ..., 0.2852, 0.1328, 0.0214],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.mlp.c_fc.SCB => tensor([0.1660, 0.1309, 0.1885, ..., 0.1289, 0.1631, 0.1230], device='cuda:0')\n", "transformer.h.10.mlp.c_fc.weight_format => col_turing\n", "transformer.h.10.mlp.c_fc2.weight => tensor([[ 27, -22, 12, ..., 31, 6, -15],\n", " [ -5, -68, -32, ..., -4, -35, 5],\n", " [-27, 31, -24, ..., -51, 11, -44],\n", " ...,\n", " [ 3, -16, 0, ..., -33, 43, 17],\n", " [ 42, 2, -17, ..., -36, 41, -19],\n", " [ 9, 40, -31, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.10.mlp.c_fc2.bias => tensor([-1.0703, -0.5938, -1.5625, ..., -0.8750, -1.0234, -0.8750],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.mlp.c_fc2.SCB => tensor([0.1021, 0.1040, 0.1094, ..., 0.0947, 0.1069, 0.1030], device='cuda:0')\n", "transformer.h.10.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.10.mlp.c_proj.weight => tensor([[-33, -2, 20, ..., 25, -7, -3],\n", " [-14, 63, 19, ..., 23, 1, -24],\n", " [-23, -19, 51, ..., -13, 31, 17],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-14, -41, -37, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.10.mlp.c_proj.bias => tensor([-0.5078, 0.6953, 0.8398, ..., 0.1631, 0.1816, -0.2148],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.10.mlp.c_proj.SCB => tensor([0.1455, 0.1338, 0.1357, ..., 0.1787, 0.1475, 0.1543], device='cuda:0')\n", "transformer.h.10.mlp.c_proj.weight_format => col_turing\n", "transformer.h.11.ln_1.weight => tensor([1.7266, 2.0625, 2.3594, ..., 1.7969, 1.7266, 1.9844], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.11.ln_1.bias => tensor([ 0.1221, -0.7891, -0.3320, ..., 0.0194, 0.0603, 0.2100],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.attn.c_attn.weight => tensor([[-31, -30, -17, ..., 12, -20, -41],\n", " [ 3, 42, 41, ..., 5, -25, 34],\n", " [ 11, 44, -64, ..., -4, 28, 11],\n", " ...,\n", " [-13, 34, 2, ..., -16, 16, -9],\n", " [ 20, 5, 32, ..., 8, -43, -19],\n", " [-12, 36, -1, ..., 4, -19, 20]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.11.attn.c_attn.bias => tensor([ 0.4043, 0.5820, 0.2734, ..., -0.1211, -0.2383, 0.1147],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.attn.c_attn.SCB => tensor([0.1357, 0.1387, 0.1206, ..., 0.1416, 0.1738, 0.1523], device='cuda:0')\n", "transformer.h.11.attn.c_attn.weight_format => col_turing\n", "transformer.h.11.attn.c_proj.weight => tensor([[ -39, 2, -4, ..., -27, -11, 4],\n", " [ -18, 46, -41, ..., -17, -35, -32],\n", " [ -26, -14, -11, ..., 18, -34, -12],\n", " ...,\n", " [ -10, -35, 5, ..., -51, 31, -55],\n", " [ 20, -17, -68, ..., -40, -127, -10],\n", " [ 41, 42, 87, ..., 14, -11, -5]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.11.attn.c_proj.bias => tensor([ 0.1670, 0.6211, 0.1660, ..., 0.3848, 0.0238, -0.4375],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.attn.c_proj.SCB => tensor([0.1021, 0.1270, 0.1396, ..., 0.1094, 0.0991, 0.1484], device='cuda:0')\n", "transformer.h.11.attn.c_proj.weight_format => col_turing\n", "transformer.h.11.ln_2.weight => tensor([0.6133, 0.9844, 0.9727, ..., 0.8398, 0.8711, 1.0938], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.11.ln_2.bias => tensor([ 0.0474, -0.3945, -0.1650, ..., 
-0.0269, 0.0243, 0.1504],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.mlp.c_fc.weight => tensor([[ 1, -27, 3, ..., -11, -6, 36],\n", " [ 9, 22, 25, ..., -12, 17, 13],\n", " [ 62, 72, 1, ..., -45, 13, 33],\n", " ...,\n", " [ 55, -7, -44, ..., -24, -4, 14],\n", " [-19, 19, -39, ..., 32, -20, -17],\n", " [-39, 11, 46, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.11.mlp.c_fc.bias => tensor([-1.3672, -0.8438, 0.9453, ..., 0.4062, 1.1641, -0.2383],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.mlp.c_fc.SCB => tensor([0.1035, 0.1177, 0.1396, ..., 0.1309, 0.1270, 0.1406], device='cuda:0')\n", "transformer.h.11.mlp.c_fc.weight_format => col_turing\n", "transformer.h.11.mlp.c_fc2.weight => tensor([[-24, -55, -24, ..., -20, -47, -9],\n", " [-34, 10, 31, ..., 5, -1, 13],\n", " [-20, -20, -87, ..., -47, 88, 28],\n", " ...,\n", " [ -7, 11, 20, ..., 24, -58, -29],\n", " [-18, 40, 10, ..., 7, 1, 22],\n", " [-65, 47, 10, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.11.mlp.c_fc2.bias => tensor([-0.0432, -0.2812, -0.5586, ..., -0.4160, -1.5156, 0.5898],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.mlp.c_fc2.SCB => tensor([0.1055, 0.1064, 0.1289, ..., 0.1128, 0.1729, 0.1030], device='cuda:0')\n", "transformer.h.11.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.11.mlp.c_proj.weight => tensor([[-29, -1, -12, ..., 15, 36, 49],\n", " [ 9, 25, -10, ..., -6, -39, -23],\n", " [ 15, 24, 0, ..., 17, 4, 9],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 18, 10, -6, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.11.mlp.c_proj.bias => tensor([-0.1553, 1.0391, 0.6445, ..., 0.5078, 0.2324, -0.4258],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.11.mlp.c_proj.SCB => tensor([0.1436, 0.1426, 0.1230, ..., 0.1553, 0.1338, 0.1377], device='cuda:0')\n", "transformer.h.11.mlp.c_proj.weight_format => col_turing\n", "transformer.h.12.ln_1.weight => tensor([1.7734, 2.3906, 2.5781, ..., 2.0625, 1.9453, 2.2031], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.12.ln_1.bias => tensor([ 0.1523, -0.8750, -0.4160, ..., 0.0112, 0.0408, 0.2285],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.attn.c_attn.weight => tensor([[-30, -38, -10, ..., 4, 13, 12],\n", " [ -8, 55, 35, ..., -30, -22, -11],\n", " [-27, 40, -30, ..., 3, -27, -6],\n", " ...,\n", " [ -2, -17, 48, ..., -26, -23, -9],\n", " [-38, 8, -24, ..., 4, 47, 28],\n", " [ 36, -74, 9, ..., 29, 56, 49]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.12.attn.c_attn.bias => tensor([-0.5234, 0.2832, 0.0142, ..., 0.2852, 0.1572, 0.0349],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.attn.c_attn.SCB => tensor([0.1230, 0.1426, 0.1069, ..., 0.1533, 0.1631, 0.1328], device='cuda:0')\n", "transformer.h.12.attn.c_attn.weight_format => col_turing\n", "transformer.h.12.attn.c_proj.weight => tensor([[ -4, 2, -8, ..., 4, -27, 30],\n", " [-45, -5, 17, ..., -27, -1, -56],\n", " [ 11, -11, -21, ..., 17, -10, 26],\n", " ...,\n", " [-48, -30, -79, ..., 26, -23, -51],\n", " [-34, -9, -33, ..., -75, -29, -66],\n", " [-67, 42, 71, ..., -41, -46, -4]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.12.attn.c_proj.bias => tensor([ 0.3125, 0.8789, 0.2578, ..., 0.6914, -0.1079, -0.6328],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.attn.c_proj.SCB => tensor([0.1270, 0.1455, 0.1240, ..., 0.1147, 
0.1250, 0.1221], device='cuda:0')\n", "transformer.h.12.attn.c_proj.weight_format => col_turing\n", "transformer.h.12.ln_2.weight => tensor([0.6680, 1.0391, 1.0156, ..., 0.8906, 0.9258, 1.1250], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.12.ln_2.bias => tensor([ 0.0530, -0.4414, -0.1670, ..., -0.0309, 0.0403, 0.1709],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.mlp.c_fc.weight => tensor([[-54, -2, 3, ..., -2, -57, -66],\n", " [ -4, -8, 10, ..., 106, 16, -27],\n", " [ 26, 24, 27, ..., 36, -15, 46],\n", " ...,\n", " [ 48, 25, -20, ..., -7, 10, 14],\n", " [ 40, 14, -6, ..., -46, 25, -36],\n", " [ 58, 71, -38, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.12.mlp.c_fc.bias => tensor([-0.4238, -0.8984, -0.3281, ..., 0.2061, -0.0933, 0.2539],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.mlp.c_fc.SCB => tensor([0.1416, 0.1094, 0.1118, ..., 0.1187, 0.1602, 0.1328], device='cuda:0')\n", "transformer.h.12.mlp.c_fc.weight_format => col_turing\n", "transformer.h.12.mlp.c_fc2.weight => tensor([[-46, -54, 12, ..., 12, -61, -36],\n", " [ 54, 39, 21, ..., 1, -5, 47],\n", " [ 37, 21, -79, ..., -42, -38, 16],\n", " ...,\n", " [-15, -25, 43, ..., 43, 74, 7],\n", " [-29, 18, 28, ..., 17, -24, -23],\n", " [ 31, 0, 26, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.12.mlp.c_fc2.bias => tensor([-1.0391, -0.1348, 0.6992, ..., 0.2676, -0.7617, 0.5547],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.mlp.c_fc2.SCB => tensor([0.1108, 0.1069, 0.0938, ..., 0.1011, 0.1030, 0.1377], device='cuda:0')\n", "transformer.h.12.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.12.mlp.c_proj.weight => tensor([[-20, -67, -42, ..., 36, 5, 25],\n", " [-10, -48, -4, ..., 1, 57, -11],\n", " [ 2, 21, -16, ..., -25, -2, 36],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 18, -27, -26, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.12.mlp.c_proj.bias => tensor([-0.1260, 0.9062, 0.7383, ..., 0.4844, 0.1245, -0.4336],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.12.mlp.c_proj.SCB => tensor([0.1367, 0.1328, 0.1484, ..., 0.1523, 0.1260, 0.1230], device='cuda:0')\n", "transformer.h.12.mlp.c_proj.weight_format => col_turing\n", "transformer.h.13.ln_1.weight => tensor([1.8594, 2.4219, 2.6094, ..., 2.2031, 2.0938, 2.2344], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.13.ln_1.bias => tensor([ 0.1147, -0.9219, -0.4277, ..., -0.0181, 0.0197, 0.3242],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.attn.c_attn.weight => tensor([[ 43, 56, -23, ..., 43, -38, 3],\n", " [ 22, -54, 8, ..., 22, 60, -13],\n", " [ 23, 39, -40, ..., -5, 10, 6],\n", " ...,\n", " [ 7, 22, 18, ..., 27, -14, 34],\n", " [-16, -32, -16, ..., -4, 27, 6],\n", " [-32, 80, 27, ..., -42, 50, 11]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.13.attn.c_attn.bias => tensor([ 0.0271, -0.2363, 0.0398, ..., 0.2207, 0.3164, 0.0149],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.attn.c_attn.SCB => tensor([0.1289, 0.1226, 0.1172, ..., 0.1553, 0.1572, 0.1357], device='cuda:0')\n", "transformer.h.13.attn.c_attn.weight_format => col_turing\n", "transformer.h.13.attn.c_proj.weight => tensor([[-25, 62, 47, ..., -56, 1, -35],\n", " [-32, 23, 14, ..., -39, 43, 9],\n", " [ 17, -2, 8, ..., 34, -17, -44],\n", " ...,\n", " [ 56, -48, 27, ..., -35, -24, -33],\n", " [ 26, -20, -13, ..., -44, -30, 
55],\n", " [ 57, 51, -16, ..., -11, 11, -1]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.13.attn.c_proj.bias => tensor([ 0.3711, 1.1797, 0.2354, ..., 0.7148, -0.2832, -0.7383],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.attn.c_proj.SCB => tensor([0.1138, 0.1187, 0.1089, ..., 0.1133, 0.1045, 0.1108], device='cuda:0')\n", "transformer.h.13.attn.c_proj.weight_format => col_turing\n", "transformer.h.13.ln_2.weight => tensor([0.7227, 1.0625, 1.0391, ..., 0.9453, 0.9531, 1.1484], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.13.ln_2.bias => tensor([ 0.0315, -0.4707, -0.1436, ..., -0.0593, 0.0459, 0.2266],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.mlp.c_fc.weight => tensor([[ -16, 18, -77, ..., 4, 24, 21],\n", " [ 13, 18, -41, ..., 11, -23, -53],\n", " [ 55, 2, 3, ..., 9, -106, 22],\n", " ...,\n", " [ 34, 1, -13, ..., -10, -5, 20],\n", " [ -69, 45, -2, ..., 30, 9, 19],\n", " [ -8, 9, -4, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.13.mlp.c_fc.bias => tensor([-1.5000, -0.5039, -0.7617, ..., -0.3613, -0.1162, -0.1172],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.mlp.c_fc.SCB => tensor([0.1162, 0.1050, 0.1309, ..., 0.1108, 0.1328, 0.1631], device='cuda:0')\n", "transformer.h.13.mlp.c_fc.weight_format => col_turing\n", "transformer.h.13.mlp.c_fc2.weight => tensor([[-49, 2, 39, ..., 59, -29, -4],\n", " [ 37, -50, -7, ..., -14, -8, -36],\n", " [-10, 0, 43, ..., 0, -12, -19],\n", " ...,\n", " [ 13, -13, -11, ..., -66, 18, 48],\n", " [-35, 50, -46, ..., -67, 6, -17],\n", " [-15, -25, -43, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.13.mlp.c_fc2.bias => tensor([ 0.4668, 0.0500, -0.4453, ..., -0.3965, -0.9258, -0.7969],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.mlp.c_fc2.SCB => tensor([0.1060, 0.1079, 0.1040, ..., 0.0986, 0.1035, 0.1177], device='cuda:0')\n", "transformer.h.13.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.13.mlp.c_proj.weight => tensor([[-58, -24, 42, ..., 27, -21, 35],\n", " [-40, -29, -31, ..., -38, -11, -27],\n", " [ 4, 1, -9, ..., -14, 28, 85],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-21, -38, 18, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.13.mlp.c_proj.bias => tensor([ 0.1069, 1.0234, 0.5703, ..., 0.4395, 0.1279, -0.1514],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.13.mlp.c_proj.SCB => tensor([0.1455, 0.1572, 0.1377, ..., 0.1357, 0.1484, 0.1494], device='cuda:0')\n", "transformer.h.13.mlp.c_proj.weight_format => col_turing\n", "transformer.h.14.ln_1.weight => tensor([1.9453, 2.3750, 2.7188, ..., 2.1719, 2.0938, 2.3281], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.14.ln_1.bias => tensor([ 0.0566, -0.9219, -0.3457, ..., -0.0605, 0.0718, 0.3555],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.attn.c_attn.weight => tensor([[ 23, -2, -19, ..., 63, 30, 1],\n", " [-42, 26, 5, ..., 6, -36, 23],\n", " [ 3, -32, 7, ..., 9, -97, -34],\n", " ...,\n", " [-59, 9, -52, ..., 11, -32, 2],\n", " [ 12, 10, -11, ..., 15, -17, 19],\n", " [ 80, -43, -13, ..., 31, 52, 9]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.14.attn.c_attn.bias => tensor([ 0.1523, -0.2891, -0.1138, ..., 0.1064, -0.1279, 0.0669],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.attn.c_attn.SCB => tensor([0.1089, 0.1060, 0.1270, ..., 0.1260, 0.1348, 0.1309], 
device='cuda:0')\n", "transformer.h.14.attn.c_attn.weight_format => col_turing\n", "transformer.h.14.attn.c_proj.weight => tensor([[ 13, -28, -9, ..., -34, -50, -1],\n", " [-19, 12, 35, ..., -57, -6, 21],\n", " [ 33, 8, 22, ..., -38, 41, -27],\n", " ...,\n", " [-60, 76, -9, ..., -25, -9, 19],\n", " [-51, -13, -4, ..., -5, -6, -12],\n", " [ 12, 22, 17, ..., -8, 4, 73]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.14.attn.c_proj.bias => tensor([ 0.2451, 1.2031, 0.9375, ..., 0.2383, -0.2451, -1.4297],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.attn.c_proj.SCB => tensor([0.1104, 0.1377, 0.1113, ..., 0.1001, 0.1260, 0.1318], device='cuda:0')\n", "transformer.h.14.attn.c_proj.weight_format => col_turing\n", "transformer.h.14.ln_2.weight => tensor([0.8516, 1.1406, 1.0781, ..., 1.0391, 1.0391, 1.2188], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.14.ln_2.bias => tensor([ 0.0356, -0.4980, -0.2051, ..., -0.0347, 0.0540, 0.2969],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.mlp.c_fc.weight => tensor([[ 34, -32, 58, ..., -5, 5, -8],\n", " [-11, 43, -2, ..., 14, 39, -23],\n", " [ 30, 39, 34, ..., -23, -13, 48],\n", " ...,\n", " [ 15, -4, 22, ..., 6, 10, -12],\n", " [ 25, -45, 1, ..., 36, -31, 20],\n", " [-25, -10, 3, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.14.mlp.c_fc.bias => tensor([ 0.2734, 0.3379, 0.7461, ..., 0.0908, -0.1504, -0.2969],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.mlp.c_fc.SCB => tensor([0.1406, 0.1104, 0.1641, ..., 0.1367, 0.1260, 0.1553], device='cuda:0')\n", "transformer.h.14.mlp.c_fc.weight_format => col_turing\n", "transformer.h.14.mlp.c_fc2.weight => tensor([[ 20, 27, -6, ..., 23, -26, 14],\n", " [-16, 4, -8, ..., 44, -17, 34],\n", " [ 26, 5, 61, ..., 6, -1, 46],\n", " ...,\n", " [ 23, 37, -43, ..., 15, -47, -16],\n", " [ -2, 24, 8, ..., 4, -13, 2],\n", " [-23, 48, 7, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.14.mlp.c_fc2.bias => tensor([-1.2344, -0.9609, -1.0625, ..., -0.4629, -0.9648, -1.3672],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.mlp.c_fc2.SCB => tensor([0.1211, 0.1328, 0.1172, ..., 0.0928, 0.1211, 0.1177], device='cuda:0')\n", "transformer.h.14.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.14.mlp.c_proj.weight => tensor([[100, 58, 31, ..., -3, 23, -22],\n", " [ 42, 33, 9, ..., -36, 14, 42],\n", " [ 33, -18, 26, ..., -25, 45, 31],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 15, 13, -23, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.14.mlp.c_proj.bias => tensor([ 0.2305, 1.1953, 0.9570, ..., 0.1777, -0.2041, -0.6953],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.14.mlp.c_proj.SCB => tensor([0.1260, 0.1367, 0.1885, ..., 0.1387, 0.1309, 0.1338], device='cuda:0')\n", "transformer.h.14.mlp.c_proj.weight_format => col_turing\n", "transformer.h.15.ln_1.weight => tensor([2.3438, 2.7031, 2.9844, ..., 2.5781, 2.5625, 2.6406], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.15.ln_1.bias => tensor([ 0.0486, -1.0156, -0.5508, ..., -0.0403, 0.0684, 0.4453],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.attn.c_attn.weight => tensor([[ 32, 31, 35, ..., -14, -1, -18],\n", " [ 73, 20, 23, ..., 14, -13, -37],\n", " [ -5, 45, -29, ..., -22, -18, -32],\n", " ...,\n", " [-38, -22, 23, ..., -18, 1, 12],\n", " [-40, 21, -24, ..., 23, -2, 56],\n", " [ 4, 27, -19, 
..., 32, 9, -20]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.15.attn.c_attn.bias => tensor([ 0.2363, 0.5430, -0.3340, ..., 0.0962, 0.0615, 0.1885],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.attn.c_attn.SCB => tensor([0.1348, 0.1367, 0.1079, ..., 0.1387, 0.1885, 0.1592], device='cuda:0')\n", "transformer.h.15.attn.c_attn.weight_format => col_turing\n", "transformer.h.15.attn.c_proj.weight => tensor([[ -5, -32, -58, ..., 2, -4, -36],\n", " [ -1, 45, 6, ..., -30, -34, -5],\n", " [ 18, 5, 23, ..., 14, 32, -10],\n", " ...,\n", " [ 21, -31, -27, ..., 17, 31, 7],\n", " [ 8, 16, -15, ..., 16, -43, 12],\n", " [ 2, -42, 22, ..., 12, -1, -65]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.15.attn.c_proj.bias => tensor([-0.0752, 1.3203, 0.8906, ..., 0.0128, -0.3750, -1.0703],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.attn.c_proj.SCB => tensor([0.1196, 0.1279, 0.1348, ..., 0.1475, 0.1445, 0.1748], device='cuda:0')\n", "transformer.h.15.attn.c_proj.weight_format => col_turing\n", "transformer.h.15.ln_2.weight => tensor([0.9570, 1.2578, 1.1562, ..., 1.1406, 1.1719, 1.3438], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.15.ln_2.bias => tensor([ 0.0679, -0.5352, -0.2930, ..., -0.0306, 0.0537, 0.3418],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.mlp.c_fc.weight => tensor([[ 8, -10, 31, ..., 3, -3, 3],\n", " [ -8, -3, 53, ..., -8, 11, 3],\n", " [ -5, -15, -8, ..., -8, 25, 35],\n", " ...,\n", " [ 37, 40, 6, ..., 54, -40, 45],\n", " [-34, 15, -3, ..., -24, -15, 9],\n", " [ 9, -37, 38, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.15.mlp.c_fc.bias => tensor([-1.0078, -0.2012, -0.3906, ..., -0.5312, -0.4688, 0.1924],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.mlp.c_fc.SCB => tensor([0.1338, 0.1143, 0.1367, ..., 0.1260, 0.1416, 0.1357], device='cuda:0')\n", "transformer.h.15.mlp.c_fc.weight_format => col_turing\n", "transformer.h.15.mlp.c_fc2.weight => tensor([[ 10, 4, 16, ..., 44, -31, -33],\n", " [ 18, 38, 16, ..., 32, -42, -8],\n", " [ 79, 16, 45, ..., 16, -13, 43],\n", " ...,\n", " [ -6, -20, 9, ..., -24, 45, 13],\n", " [ 5, 20, -17, ..., 54, -4, -3],\n", " [ 62, 46, -31, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.15.mlp.c_fc2.bias => tensor([-0.2852, 0.7539, -1.9688, ..., -0.6133, -0.2695, -0.1523],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.mlp.c_fc2.SCB => tensor([0.1143, 0.1001, 0.1235, ..., 0.1138, 0.0986, 0.1079], device='cuda:0')\n", "transformer.h.15.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.15.mlp.c_proj.weight => tensor([[ 15, -7, 93, ..., 78, 35, -34],\n", " [ 37, 3, 30, ..., 32, -21, -55],\n", " [ 17, -40, -16, ..., -31, 25, -100],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 7, -18, -1, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.15.mlp.c_proj.bias => tensor([-0.1074, 1.0156, 0.7539, ..., 0.0854, -0.3184, -0.4023],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.15.mlp.c_proj.SCB => tensor([0.1348, 0.1367, 0.1396, ..., 0.1289, 0.1562, 0.1650], device='cuda:0')\n", "transformer.h.15.mlp.c_proj.weight_format => col_turing\n", "transformer.h.16.ln_1.weight => tensor([2.5938, 2.7188, 2.9219, ..., 2.7344, 2.6094, 2.6406], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.16.ln_1.bias => tensor([ 0.0132, -0.9961, -0.5586, ..., -0.1104, 0.1143, 0.4746],\n", " 
device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.attn.c_attn.weight => tensor([[ 22, 89, -33, ..., -90, 5, -28],\n", " [ 12, -87, 44, ..., -36, -82, 47],\n", " [ 48, -4, -16, ..., 28, 41, 57],\n", " ...,\n", " [ -3, 60, 14, ..., -15, 7, 27],\n", " [ 23, -1, -30, ..., 19, -3, 5],\n", " [ -7, -51, 36, ..., 29, 32, 9]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.16.attn.c_attn.bias => tensor([-0.2871, 0.3301, -0.0664, ..., 0.1924, -0.0479, -0.0903],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.attn.c_attn.SCB => tensor([0.1133, 0.1128, 0.0991, ..., 0.1680, 0.1621, 0.1611], device='cuda:0')\n", "transformer.h.16.attn.c_attn.weight_format => col_turing\n", "transformer.h.16.attn.c_proj.weight => tensor([[ -7, 5, -22, ..., 49, 18, -4],\n", " [ -9, 38, 4, ..., -25, -32, 23],\n", " [ 4, 15, 28, ..., -16, -25, -68],\n", " ...,\n", " [-43, -58, 45, ..., 9, 46, 42],\n", " [ 38, -24, 8, ..., -5, -3, -7],\n", " [ 58, -14, 2, ..., -58, -27, -4]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.16.attn.c_proj.bias => tensor([ 0.1074, 1.2500, 1.2500, ..., -0.0427, -0.6172, -1.1719],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.attn.c_proj.SCB => tensor([0.1484, 0.1270, 0.1357, ..., 0.1357, 0.1396, 0.1309], device='cuda:0')\n", "transformer.h.16.attn.c_proj.weight_format => col_turing\n", "transformer.h.16.ln_2.weight => tensor([1.0078, 1.2969, 1.1953, ..., 1.2266, 1.2109, 1.4062], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.16.ln_2.bias => tensor([ 0.0150, -0.5312, -0.3398, ..., -0.0337, 0.1182, 0.3711],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.mlp.c_fc.weight => tensor([[ -2, -4, 7, ..., 30, -47, 40],\n", " [ 3, -12, -37, ..., -34, 4, 40],\n", " [ 5, -2, 28, ..., 20, 46, 28],\n", " ...,\n", " [ 52, -13, 23, ..., -25, 36, 33],\n", " [-23, 7, -82, ..., 60, 43, 16],\n", " [ 10, -9, 39, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.16.mlp.c_fc.bias => tensor([ 0.5820, -0.6094, 0.4609, ..., 0.0036, -0.6016, -0.0092],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.mlp.c_fc.SCB => tensor([0.1299, 0.1108, 0.1299, ..., 0.1328, 0.1260, 0.1416], device='cuda:0')\n", "transformer.h.16.mlp.c_fc.weight_format => col_turing\n", "transformer.h.16.mlp.c_fc2.weight => tensor([[-45, -20, 22, ..., -22, 18, -6],\n", " [ 25, -12, 21, ..., -1, 7, -11],\n", " [ -5, -59, -17, ..., -2, -18, -18],\n", " ...,\n", " [-46, 13, -5, ..., -24, 31, -26],\n", " [ 58, -14, 16, ..., 8, 62, 6],\n", " [ 38, 57, 13, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.16.mlp.c_fc2.bias => tensor([ 0.0135, 0.1040, -0.9453, ..., 0.3945, -0.4961, -0.7539],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.mlp.c_fc2.SCB => tensor([0.1270, 0.1240, 0.1396, ..., 0.1279, 0.1128, 0.1089], device='cuda:0')\n", "transformer.h.16.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.16.mlp.c_proj.weight => tensor([[ 7, -16, -59, ..., 12, -13, 10],\n", " [-33, -35, 31, ..., -83, -11, -4],\n", " [-12, 17, -6, ..., 7, -36, -10],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 23, -14, 8, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.16.mlp.c_proj.bias => tensor([-0.1069, 1.2422, 1.3281, ..., -0.0088, -0.4102, -0.7930],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.16.mlp.c_proj.SCB => tensor([0.1318, 0.1533, 0.1543, ..., 0.1357, 0.1377, 0.1406], 
device='cuda:0')\n", "transformer.h.16.mlp.c_proj.weight_format => col_turing\n", "transformer.h.17.ln_1.weight => tensor([2.5938, 2.6250, 2.8594, ..., 2.6875, 2.5312, 2.4375], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.17.ln_1.bias => tensor([ 0.0537, -0.9922, -0.5391, ..., -0.1138, 0.0583, 0.4805],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.attn.c_attn.weight => tensor([[-29, -30, 9, ..., -19, -23, 4],\n", " [ 8, 43, 34, ..., -34, 63, -25],\n", " [-16, -23, 3, ..., -37, 31, -17],\n", " ...,\n", " [-44, -31, 22, ..., 11, 2, -17],\n", " [ -1, -17, -45, ..., -5, 64, -17],\n", " [ 41, 1, 60, ..., 27, 9, -33]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.17.attn.c_attn.bias => tensor([-0.6992, 1.0156, -0.7656, ..., -0.0496, 0.1777, 0.4277],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.attn.c_attn.SCB => tensor([0.1270, 0.1270, 0.1187, ..., 0.1689, 0.1680, 0.1924], device='cuda:0')\n", "transformer.h.17.attn.c_attn.weight_format => col_turing\n", "transformer.h.17.attn.c_proj.weight => tensor([[ 8, 10, -14, ..., -21, -30, 5],\n", " [-14, 38, -25, ..., 39, -57, -11],\n", " [-23, -6, -30, ..., 13, 15, -13],\n", " ...,\n", " [-22, -1, 64, ..., -45, 23, -6],\n", " [ -2, -22, 45, ..., -10, 52, 108],\n", " [ 6, -10, 61, ..., 28, -55, -85]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.17.attn.c_proj.bias => tensor([ 0.0967, 1.3516, 1.3750, ..., 0.0231, -0.7461, -1.0781],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.attn.c_proj.SCB => tensor([0.1104, 0.1455, 0.1357, ..., 0.1504, 0.1221, 0.1357], device='cuda:0')\n", "transformer.h.17.attn.c_proj.weight_format => col_turing\n", "transformer.h.17.ln_2.weight => tensor([1.0781, 1.3359, 1.2891, ..., 1.2969, 1.2891, 1.4766], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.17.ln_2.bias => tensor([ 0.0496, -0.5742, -0.3223, ..., -0.0396, 0.0845, 0.3984],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.mlp.c_fc.weight => tensor([[ 61, -3, -35, ..., 31, -14, -53],\n", " [ 9, -39, 25, ..., -9, -13, -17],\n", " [ 31, 28, 63, ..., -36, 64, -10],\n", " ...,\n", " [-43, -31, 56, ..., 23, 11, -4],\n", " [ 20, 5, -29, ..., -3, -19, 12],\n", " [ -4, 0, -96, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.17.mlp.c_fc.bias => tensor([-1.1406, 0.2617, 0.3066, ..., -1.5000, -0.3281, 0.8359],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.mlp.c_fc.SCB => tensor([0.1230, 0.1270, 0.1289, ..., 0.1836, 0.1582, 0.1426], device='cuda:0')\n", "transformer.h.17.mlp.c_fc.weight_format => col_turing\n", "transformer.h.17.mlp.c_fc2.weight => tensor([[ -3, 62, 34, ..., 3, -17, 5],\n", " [-16, -18, 2, ..., 9, 6, -9],\n", " [ 21, -33, -44, ..., -28, 13, 16],\n", " ...,\n", " [ 35, 49, 29, ..., -2, 2, 76],\n", " [ 13, -14, -2, ..., -12, -59, -82],\n", " [ 24, -59, 22, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.17.mlp.c_fc2.bias => tensor([-0.7188, -0.3926, -1.3984, ..., -0.8516, -0.9062, -0.9844],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.mlp.c_fc2.SCB => tensor([0.1138, 0.1064, 0.1050, ..., 0.1069, 0.1035, 0.1196], device='cuda:0')\n", "transformer.h.17.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.17.mlp.c_proj.weight => tensor([[-16, -1, 10, ..., -13, 2, 44],\n", " [-22, -24, -10, ..., 17, -10, 29],\n", " [ -4, -50, -11, ..., 15, 30, -19],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-14, -8, 24, ..., 0, 0, 
0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.17.mlp.c_proj.bias => tensor([-0.1201, 1.2422, 1.1719, ..., 0.1719, -0.5078, -0.9961],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.17.mlp.c_proj.SCB => tensor([0.1338, 0.1641, 0.1631, ..., 0.1416, 0.1299, 0.1465], device='cuda:0')\n", "transformer.h.17.mlp.c_proj.weight_format => col_turing\n", "transformer.h.18.ln_1.weight => tensor([2.6250, 2.5781, 2.7812, ..., 2.7188, 2.6406, 2.5469], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.18.ln_1.bias => tensor([ 0.0190, -1.0000, -0.5352, ..., -0.1084, 0.0908, 0.5625],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.attn.c_attn.weight => tensor([[ 10, 43, 16, ..., 2, -4, -27],\n", " [-13, 26, 3, ..., -71, -24, 16],\n", " [ -9, 0, 7, ..., -64, 4, -12],\n", " ...,\n", " [ 6, 0, -18, ..., 6, 9, -22],\n", " [ 19, -4, -43, ..., 20, 81, -59],\n", " [ 32, -18, -36, ..., -4, -24, -7]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.18.attn.c_attn.bias => tensor([-1.5781, -0.3613, 0.0120, ..., 0.1123, 0.1455, 0.3730],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.attn.c_attn.SCB => tensor([0.1064, 0.1064, 0.1133, ..., 0.1924, 0.1562, 0.1436], device='cuda:0')\n", "transformer.h.18.attn.c_attn.weight_format => col_turing\n", "transformer.h.18.attn.c_proj.weight => tensor([[ -30, -37, 54, ..., -16, -13, 38],\n", " [ -33, 39, 13, ..., -36, -32, -35],\n", " [ -9, 4, 5, ..., -5, 19, -47],\n", " ...,\n", " [ -42, -30, -19, ..., -29, 56, 12],\n", " [-103, 64, 36, ..., -7, -21, 28],\n", " [ -8, 4, 49, ..., -26, -14, 36]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.18.attn.c_proj.bias => tensor([ 0.0679, 1.4922, 1.3047, ..., -0.1357, -0.8359, -1.7031],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.attn.c_proj.SCB => tensor([0.1445, 0.1128, 0.1426, ..., 0.1572, 0.1318, 0.1289], device='cuda:0')\n", "transformer.h.18.attn.c_proj.weight_format => col_turing\n", "transformer.h.18.ln_2.weight => tensor([1.1875, 1.4141, 1.3516, ..., 1.3672, 1.3906, 1.5312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.18.ln_2.bias => tensor([ 0.0184, -0.6445, -0.3418, ..., -0.0515, 0.0762, 0.4609],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.mlp.c_fc.weight => tensor([[ 8, 2, 58, ..., 35, 45, 4],\n", " [ 45, 13, -13, ..., -8, -8, -8],\n", " [-55, 23, 10, ..., 79, 107, 10],\n", " ...,\n", " [ 22, -23, -21, ..., -6, 14, 4],\n", " [-10, 33, 3, ..., -31, -6, -19],\n", " [-17, 16, 5, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.18.mlp.c_fc.bias => tensor([-0.3809, -0.8828, -0.0894, ..., -0.6406, 0.8711, 0.4199],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.mlp.c_fc.SCB => tensor([0.1289, 0.1084, 0.1162, ..., 0.1357, 0.1406, 0.1299], device='cuda:0')\n", "transformer.h.18.mlp.c_fc.weight_format => col_turing\n", "transformer.h.18.mlp.c_fc2.weight => tensor([[-21, 75, -19, ..., 42, 36, 13],\n", " [ 11, 3, 97, ..., -8, 15, 0],\n", " [ 50, 27, 8, ..., 39, -5, -25],\n", " ...,\n", " [ 51, -13, -39, ..., -16, -24, -7],\n", " [ 1, 41, 37, ..., -35, -17, -1],\n", " [ 28, 49, -6, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.18.mlp.c_fc2.bias => tensor([-1.2344, 0.0386, -0.1924, ..., -0.3301, -1.6719, -1.1484],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.mlp.c_fc2.SCB => tensor([0.1187, 0.1055, 0.1079, ..., 0.1030, 0.1289, 0.1211], 
device='cuda:0')\n", "transformer.h.18.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.18.mlp.c_proj.weight => tensor([[ 32, 10, -18, ..., 6, 10, -16],\n", " [ 4, -13, -66, ..., 8, -16, 27],\n", " [-41, -5, 5, ..., -13, 17, 20],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-41, -15, -34, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.18.mlp.c_proj.bias => tensor([ 0.1172, 1.5859, 1.1016, ..., -0.2217, -0.6289, -1.4688],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.18.mlp.c_proj.SCB => tensor([0.1396, 0.1611, 0.1523, ..., 0.1436, 0.1406, 0.1348], device='cuda:0')\n", "transformer.h.18.mlp.c_proj.weight_format => col_turing\n", "transformer.h.19.ln_1.weight => tensor([2.7656, 2.6250, 2.8750, ..., 2.6719, 2.7344, 2.5156], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.19.ln_1.bias => tensor([-9.2697e-04, -1.0547e+00, -6.1328e-01, ..., -5.2490e-02,\n", " 1.3867e-01, 6.7578e-01], device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.attn.c_attn.weight => tensor([[ 5, 70, 48, ..., 4, -24, -22],\n", " [-12, -33, -18, ..., -26, -16, -26],\n", " [ -6, -15, 19, ..., -1, 32, -7],\n", " ...,\n", " [-39, -22, 63, ..., -14, 45, -36],\n", " [-42, 20, -9, ..., 39, 28, -12],\n", " [-35, 29, 41, ..., 7, -13, 7]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.19.attn.c_attn.bias => tensor([-0.9141, 0.9297, -0.3223, ..., 0.5898, 0.6914, -0.0337],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.attn.c_attn.SCB => tensor([0.1074, 0.1216, 0.1167, ..., 0.1553, 0.1514, 0.1484], device='cuda:0')\n", "transformer.h.19.attn.c_attn.weight_format => col_turing\n", "transformer.h.19.attn.c_proj.weight => tensor([[-24, -18, -23, ..., 6, 13, -7],\n", " [ 14, 0, -24, ..., 25, 20, 20],\n", " [ -8, 20, 18, ..., -7, -30, -13],\n", " ...,\n", " [ 81, 36, -44, ..., -38, -2, -11],\n", " [ -4, -27, 23, ..., 10, -50, -3],\n", " [-12, 7, 9, ..., -39, -51, 35]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.19.attn.c_proj.bias => tensor([ 0.2109, 1.8047, 1.0859, ..., -0.4863, -1.0000, -2.1875],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.attn.c_proj.SCB => tensor([0.1504, 0.1523, 0.1367, ..., 0.1279, 0.1338, 0.1318], device='cuda:0')\n", "transformer.h.19.attn.c_proj.weight_format => col_turing\n", "transformer.h.19.ln_2.weight => tensor([1.2969, 1.5156, 1.4844, ..., 1.4922, 1.4688, 1.6250], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.19.ln_2.bias => tensor([-0.0167, -0.7109, -0.3691, ..., -0.0099, 0.1475, 0.6406],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.mlp.c_fc.weight => tensor([[-41, -18, 14, ..., 12, 3, 14],\n", " [-28, -81, 0, ..., -48, 24, 34],\n", " [ -1, -18, -11, ..., -22, -48, 33],\n", " ...,\n", " [ 3, 26, -14, ..., -25, 12, 60],\n", " [ 28, 5, 1, ..., 32, 0, -14],\n", " [ 14, 30, -5, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.19.mlp.c_fc.bias => tensor([-0.4375, -1.0156, 0.2617, ..., 0.2363, 0.8203, -1.1719],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.mlp.c_fc.SCB => tensor([0.1177, 0.1367, 0.1318, ..., 0.1128, 0.1211, 0.1787], device='cuda:0')\n", "transformer.h.19.mlp.c_fc.weight_format => col_turing\n", "transformer.h.19.mlp.c_fc2.weight => tensor([[ 58, -36, 40, ..., 44, -34, -27],\n", " [-113, -15, -39, ..., -25, 28, 38],\n", " [ 24, 14, 14, ..., 7, -32, -10],\n", " ...,\n", " [ 15, 44, -7, ..., -14, -19, -21],\n", " [ 8, 7, -25, 
..., -37, 28, 10],\n", " [ 63, 12, -9, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.19.mlp.c_fc2.bias => tensor([-1.7578, -0.3535, -1.7891, ..., -0.6250, -1.0625, -2.3125],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.mlp.c_fc2.SCB => tensor([0.1060, 0.0962, 0.1074, ..., 0.0967, 0.0996, 0.1128], device='cuda:0')\n", "transformer.h.19.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.19.mlp.c_proj.weight => tensor([[ 46, 10, 20, ..., -9, 4, -24],\n", " [-40, 4, -22, ..., 53, -16, -56],\n", " [-15, -32, 15, ..., 29, -10, -13],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 35, 25, 60, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.19.mlp.c_proj.bias => tensor([ 0.0208, 1.9453, 1.2344, ..., -0.2773, -1.3438, -1.6406],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.19.mlp.c_proj.SCB => tensor([0.1318, 0.1416, 0.1475, ..., 0.1631, 0.1270, 0.1504], device='cuda:0')\n", "transformer.h.19.mlp.c_proj.weight_format => col_turing\n", "transformer.h.20.ln_1.weight => tensor([2.8125, 2.6719, 2.8125, ..., 2.7812, 2.7031, 2.5312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.20.ln_1.bias => tensor([-0.0156, -1.1406, -0.5664, ..., -0.0273, 0.1177, 0.7461],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.attn.c_attn.weight => tensor([[ -6, -3, -8, ..., -49, 11, -12],\n", " [ -3, -18, -2, ..., 22, 13, 4],\n", " [ 50, 5, -32, ..., 3, 46, -27],\n", " ...,\n", " [-53, -20, -63, ..., -15, 36, -44],\n", " [ -4, -29, -22, ..., -67, 1, 41],\n", " [ 35, 25, -1, ..., -1, -35, 22]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.20.attn.c_attn.bias => tensor([-0.1338, 0.4707, -0.4609, ..., -0.0835, 0.0225, 0.1533],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.attn.c_attn.SCB => tensor([0.0952, 0.1182, 0.1064, ..., 0.1523, 0.1689, 0.1602], device='cuda:0')\n", "transformer.h.20.attn.c_attn.weight_format => col_turing\n", "transformer.h.20.attn.c_proj.weight => tensor([[ 2, 16, 7, ..., 12, -57, -39],\n", " [ -5, 39, 27, ..., -3, 2, -19],\n", " [ -3, 43, 29, ..., -6, 5, 7],\n", " ...,\n", " [ 54, -40, -16, ..., -50, 42, -43],\n", " [ -9, 2, 16, ..., 85, 31, -9],\n", " [-29, 5, -41, ..., -27, -33, -24]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.20.attn.c_proj.bias => tensor([ 0.5742, 1.7812, 0.4805, ..., 0.1157, -1.4766, -1.8984],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.attn.c_proj.SCB => tensor([0.1309, 0.1377, 0.1426, ..., 0.1562, 0.1230, 0.1328], device='cuda:0')\n", "transformer.h.20.attn.c_proj.weight_format => col_turing\n", "transformer.h.20.ln_2.weight => tensor([1.4062, 1.6094, 1.5547, ..., 1.5625, 1.5781, 1.6953], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.20.ln_2.bias => tensor([-0.0703, -0.8203, -0.3223, ..., -0.0197, 0.1689, 0.6367],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.mlp.c_fc.weight => tensor([[ 42, 12, -19, ..., -54, 5, 53],\n", " [ 59, -1, -29, ..., -37, 26, -14],\n", " [ 6, 30, -6, ..., -62, -4, -21],\n", " ...,\n", " [-59, 84, 0, ..., -14, 22, 39],\n", " [-10, -23, 18, ..., -32, -51, 93],\n", " [ -8, 32, 42, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.20.mlp.c_fc.bias => tensor([ 1.2734, -0.7812, 0.7188, ..., 0.5234, -0.3730, -0.3359],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.mlp.c_fc.SCB => tensor([0.1816, 0.1162, 0.1504, ..., 0.1553, 
0.1309, 0.1738], device='cuda:0')\n", "transformer.h.20.mlp.c_fc.weight_format => col_turing\n", "transformer.h.20.mlp.c_fc2.weight => tensor([[ -5, 23, 49, ..., -12, 16, 26],\n", " [-18, 16, -5, ..., 23, 10, 15],\n", " [ -7, 4, 30, ..., -37, 26, 23],\n", " ...,\n", " [-22, 0, -6, ..., 9, -27, -20],\n", " [-18, 1, 15, ..., -16, -15, -34],\n", " [-28, 11, 7, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.20.mlp.c_fc2.bias => tensor([-2.0156, -0.9492, -1.6719, ..., -2.3594, -1.9141, -1.8359],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.mlp.c_fc2.SCB => tensor([0.1465, 0.0986, 0.1445, ..., 0.1206, 0.1050, 0.1143], device='cuda:0')\n", "transformer.h.20.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.20.mlp.c_proj.weight => tensor([[-10, 12, -3, ..., -52, 15, 32],\n", " [-13, 6, -23, ..., -2, -26, -28],\n", " [ 2, 5, -21, ..., 0, 32, -11],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 24, -12, 42, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.20.mlp.c_proj.bias => tensor([ 0.4629, 1.6484, 0.0869, ..., 0.1660, -1.2188, -1.8672],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.20.mlp.c_proj.SCB => tensor([0.1436, 0.1338, 0.1553, ..., 0.1445, 0.1562, 0.1602], device='cuda:0')\n", "transformer.h.20.mlp.c_proj.weight_format => col_turing\n", "transformer.h.21.ln_1.weight => tensor([2.9375, 2.5781, 2.8125, ..., 2.8125, 2.8438, 2.4844], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.21.ln_1.bias => tensor([-0.0854, -1.1328, -0.5117, ..., 0.0267, 0.1089, 0.7695],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.attn.c_attn.weight => tensor([[ 27, 18, -50, ..., -35, -44, 16],\n", " [ -37, -23, -46, ..., -28, 19, 20],\n", " [ -20, 20, 4, ..., -42, 4, 47],\n", " ...,\n", " [ -37, -10, 61, ..., 8, -41, 17],\n", " [ -9, -54, 10, ..., -4, -10, -13],\n", " [-107, 35, -23, ..., -18, -27, -4]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.21.attn.c_attn.bias => tensor([-0.5977, -0.1094, -0.3672, ..., 0.0762, 0.4785, 0.3125],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.attn.c_attn.SCB => tensor([0.1133, 0.0938, 0.1289, ..., 0.1689, 0.1436, 0.1514], device='cuda:0')\n", "transformer.h.21.attn.c_attn.weight_format => col_turing\n", "transformer.h.21.attn.c_proj.weight => tensor([[-34, -9, -24, ..., 7, 8, -8],\n", " [-32, -14, 27, ..., -6, -18, 6],\n", " [ 45, 43, -17, ..., 23, 31, 21],\n", " ...,\n", " [ 12, 3, -12, ..., 3, -43, 45],\n", " [ 15, 2, 20, ..., 18, 25, -6],\n", " [ 7, -28, -20, ..., -27, -2, 10]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.21.attn.c_proj.bias => tensor([ 0.8984, 1.0234, 0.4277, ..., 0.4062, -0.8711, -1.7031],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.attn.c_proj.SCB => tensor([0.1699, 0.1768, 0.1611, ..., 0.1445, 0.1465, 0.1855], device='cuda:0')\n", "transformer.h.21.attn.c_proj.weight_format => col_turing\n", "transformer.h.21.ln_2.weight => tensor([1.6172, 1.7422, 1.7031, ..., 1.7188, 1.7188, 1.8516], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.21.ln_2.bias => tensor([-0.0840, -0.7812, -0.2715, ..., -0.0190, 0.1348, 0.7422],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.mlp.c_fc.weight => tensor([[ 9, 2, -42, ..., -25, 11, -14],\n", " [ 6, -36, -25, ..., 16, 9, -7],\n", " [-14, 14, -42, ..., 0, 28, 13],\n", " ...,\n", " [-37, -25, 62, ..., 16, -50, 12],\n", " [ 7, 39, -15, ..., -49, 13, 
-56],\n", " [-72, -16, 18, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.21.mlp.c_fc.bias => tensor([ 0.1357, -0.7109, -0.0046, ..., -0.6953, -0.0011, 0.1011],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.mlp.c_fc.SCB => tensor([0.1650, 0.1191, 0.1289, ..., 0.1167, 0.1416, 0.1187], device='cuda:0')\n", "transformer.h.21.mlp.c_fc.weight_format => col_turing\n", "transformer.h.21.mlp.c_fc2.weight => tensor([[-58, -13, 31, ..., 2, -28, 53],\n", " [-11, -19, -25, ..., -51, -9, 27],\n", " [ -2, 3, -3, ..., -12, 13, -71],\n", " ...,\n", " [ 21, -64, -27, ..., 7, -52, -25],\n", " [ 24, 43, 15, ..., 53, 9, 18],\n", " [-25, 8, 14, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.21.mlp.c_fc2.bias => tensor([-4.1875, -1.4141, -2.2188, ..., -1.5312, -2.3125, -1.8672],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.mlp.c_fc2.SCB => tensor([0.1357, 0.1099, 0.1143, ..., 0.0991, 0.1138, 0.0952], device='cuda:0')\n", "transformer.h.21.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.21.mlp.c_proj.weight => tensor([[ 7, -3, 19, ..., 23, 11, 1],\n", " [-27, 4, 19, ..., 25, -10, -34],\n", " [-18, 13, -3, ..., 34, 45, -22],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-56, -22, 16, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.21.mlp.c_proj.bias => tensor([ 1.0547, 1.1562, 0.3262, ..., 0.4648, -0.8789, -1.8125],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.21.mlp.c_proj.SCB => tensor([0.1504, 0.1514, 0.1475, ..., 0.1445, 0.1270, 0.1533], device='cuda:0')\n", "transformer.h.21.mlp.c_proj.weight_format => col_turing\n", "transformer.h.22.ln_1.weight => tensor([3.2969, 2.7031, 2.9531, ..., 2.9219, 2.9219, 2.4531], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.22.ln_1.bias => tensor([-0.0908, -1.0938, -0.4590, ..., -0.0391, 0.1240, 0.7070],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.attn.c_attn.weight => tensor([[ 10, -77, -14, ..., -6, 12, -1],\n", " [ 14, 26, -9, ..., -16, 18, -48],\n", " [ 22, -12, 1, ..., -70, 27, 48],\n", " ...,\n", " [ 2, -8, -7, ..., 1, -34, -3],\n", " [-31, 15, -16, ..., 61, 27, -46],\n", " [ 26, -40, 18, ..., -47, -12, -22]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.22.attn.c_attn.bias => tensor([ 0.8633, 0.0698, -0.1689, ..., -0.4688, 0.0645, 0.0781],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.attn.c_attn.SCB => tensor([0.1191, 0.1270, 0.1152, ..., 0.2207, 0.2793, 0.2412], device='cuda:0')\n", "transformer.h.22.attn.c_attn.weight_format => col_turing\n", "transformer.h.22.attn.c_proj.weight => tensor([[ 35, 34, 5, ..., 0, -25, 15],\n", " [ 36, -12, -8, ..., -41, -28, -45],\n", " [ 5, -35, 11, ..., 26, 9, 46],\n", " ...,\n", " [ 22, -43, -27, ..., -76, 40, 44],\n", " [-70, 9, 15, ..., 40, 32, 32],\n", " [-28, -4, 60, ..., 11, -56, 27]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.22.attn.c_proj.bias => tensor([ 1.0469, 1.1875, 0.1621, ..., 0.6484, -0.8789, -1.3750],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.attn.c_proj.SCB => tensor([0.1475, 0.1289, 0.1445, ..., 0.1582, 0.1436, 0.1719], device='cuda:0')\n", "transformer.h.22.attn.c_proj.weight_format => col_turing\n", "transformer.h.22.ln_2.weight => tensor([1.7344, 1.8594, 1.8203, ..., 1.8516, 1.8438, 1.9453], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.22.ln_2.bias => tensor([-0.1377, -0.7539, -0.1963, 
..., -0.0430, 0.1543, 0.7383],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.mlp.c_fc.weight => tensor([[ 28, -24, -49, ..., 82, -9, -8],\n", " [ 37, 75, 1, ..., 16, -17, 85],\n", " [ 18, -4, 53, ..., 2, 17, -43],\n", " ...,\n", " [ -8, -8, -10, ..., 22, 23, -7],\n", " [ -3, -43, 19, ..., -49, 11, -25],\n", " [-40, 7, 11, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.22.mlp.c_fc.bias => tensor([-1.2360e-03, -1.0391e+00, 1.8945e-01, ..., -8.9453e-01,\n", " 2.6953e-01, -1.4922e+00], device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.mlp.c_fc.SCB => tensor([0.1396, 0.1108, 0.1338, ..., 0.1279, 0.1777, 0.1260], device='cuda:0')\n", "transformer.h.22.mlp.c_fc.weight_format => col_turing\n", "transformer.h.22.mlp.c_fc2.weight => tensor([[ 25, -29, 19, ..., 54, 41, -14],\n", " [-26, 18, -3, ..., 17, 12, -15],\n", " [-21, 49, 13, ..., 14, 5, 32],\n", " ...,\n", " [-24, -18, -15, ..., 9, 58, -74],\n", " [ 42, -42, 63, ..., 13, 4, 11],\n", " [ -6, -17, -22, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.22.mlp.c_fc2.bias => tensor([-3.2969, -1.2891, -2.5156, ..., -1.9375, -2.4688, -2.0156],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.mlp.c_fc2.SCB => tensor([0.1299, 0.1138, 0.1328, ..., 0.1338, 0.1040, 0.0991], device='cuda:0')\n", "transformer.h.22.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.22.mlp.c_proj.weight => tensor([[ -3, 3, -9, ..., 24, 34, 34],\n", " [ 24, 19, 3, ..., -13, -29, -7],\n", " [ -4, 7, 2, ..., -38, 10, -18],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 35, 22, -15, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.22.mlp.c_proj.bias => tensor([ 0.5234, 0.8555, -0.0288, ..., 0.6680, -0.6484, -1.5547],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.22.mlp.c_proj.SCB => tensor([0.1348, 0.1426, 0.1592, ..., 0.1514, 0.1406, 0.1426], device='cuda:0')\n", "transformer.h.22.mlp.c_proj.weight_format => col_turing\n", "transformer.h.23.ln_1.weight => tensor([3.4375, 2.7500, 2.9531, ..., 2.9531, 2.9688, 2.4688], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.23.ln_1.bias => tensor([-0.0986, -1.1172, -0.4062, ..., -0.0664, 0.1436, 0.7227],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.attn.c_attn.weight => tensor([[ 33, 25, -5, ..., 26, 26, 7],\n", " [ 1, 10, 1, ..., 12, 45, 50],\n", " [ -6, -52, 16, ..., 23, -9, 15],\n", " ...,\n", " [-14, 7, 8, ..., -59, -12, 4],\n", " [-10, -21, 19, ..., 32, 57, 42],\n", " [ 14, -26, -16, ..., 41, 18, 23]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.23.attn.c_attn.bias => tensor([ 0.5195, 0.1377, -0.5234, ..., -0.2949, -0.0532, -0.3809],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.attn.c_attn.SCB => tensor([0.1064, 0.1138, 0.1147, ..., 0.1924, 0.1533, 0.1729], device='cuda:0')\n", "transformer.h.23.attn.c_attn.weight_format => col_turing\n", "transformer.h.23.attn.c_proj.weight => tensor([[-46, 1, 3, ..., -19, -8, -40],\n", " [ 35, 2, 32, ..., 17, 5, -19],\n", " [-31, 5, 1, ..., 25, -12, 32],\n", " ...,\n", " [ 34, 1, -37, ..., -7, -9, -25],\n", " [ 40, -43, 102, ..., 6, 26, -8],\n", " [-18, -13, 76, ..., -13, 76, -44]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.23.attn.c_proj.bias => tensor([ 0.7109, 0.7461, 0.3867, ..., 0.7578, -0.8828, -1.3984],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.attn.c_proj.SCB => tensor([0.1270, 0.1924, 
0.1660, ..., 0.1416, 0.1777, 0.1445], device='cuda:0')\n", "transformer.h.23.attn.c_proj.weight_format => col_turing\n", "transformer.h.23.ln_2.weight => tensor([1.8438, 1.9453, 1.9531, ..., 1.9297, 1.9219, 1.9766], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.23.ln_2.bias => tensor([-0.1914, -0.7734, -0.2109, ..., -0.0874, 0.1289, 0.7617],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.mlp.c_fc.weight => tensor([[-67, -32, 4, ..., 3, 5, 4],\n", " [ 29, 62, -10, ..., 10, 23, 15],\n", " [ 11, 11, 27, ..., -61, 44, 16],\n", " ...,\n", " [ 3, 54, 9, ..., -32, 32, 12],\n", " [-40, 14, -17, ..., -61, 37, -16],\n", " [ 29, 4, -23, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.23.mlp.c_fc.bias => tensor([ 0.1147, -3.5156, 0.1562, ..., -0.8242, 0.2275, 0.5195],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.mlp.c_fc.SCB => tensor([0.1196, 0.2715, 0.1309, ..., 0.1270, 0.1318, 0.1191], device='cuda:0')\n", "transformer.h.23.mlp.c_fc.weight_format => col_turing\n", "transformer.h.23.mlp.c_fc2.weight => tensor([[ -39, -14, -47, ..., 10, -36, -101],\n", " [ -14, -20, 31, ..., -1, -10, -18],\n", " [ -12, -37, 39, ..., 69, -29, -2],\n", " ...,\n", " [ -8, -23, -12, ..., 28, -28, 0],\n", " [ 35, 6, -11, ..., -18, 22, -30],\n", " [ -59, 13, 23, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.23.mlp.c_fc2.bias => tensor([-2.5469, -2.5781, -1.7500, ..., -2.3594, -1.4531, -2.7344],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.mlp.c_fc2.SCB => tensor([0.1387, 0.1338, 0.1299, ..., 0.1123, 0.1089, 0.1069], device='cuda:0')\n", "transformer.h.23.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.23.mlp.c_proj.weight => tensor([[ 3, -26, 55, ..., -39, -1, -16],\n", " [ -22, 30, -54, ..., -7, -7, 61],\n", " [ 7, -9, 16, ..., -16, -4, 1],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-101, 39, -58, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.23.mlp.c_proj.bias => tensor([ 0.3125, 0.9688, 0.4785, ..., 0.2812, -0.4746, -1.1328],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.23.mlp.c_proj.SCB => tensor([0.1260, 0.1328, 0.1543, ..., 0.1426, 0.1465, 0.1602], device='cuda:0')\n", "transformer.h.23.mlp.c_proj.weight_format => col_turing\n", "transformer.h.24.ln_1.weight => tensor([3.5625, 2.8281, 3.0625, ..., 3.0781, 3.2500, 2.5312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.24.ln_1.bias => tensor([-0.1670, -1.1016, -0.3223, ..., -0.1001, 0.2070, 0.7305],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.attn.c_attn.weight => tensor([[ -6, -1, -35, ..., 39, 49, 6],\n", " [-31, -24, -27, ..., 23, -39, 18],\n", " [-34, 38, 24, ..., 17, 23, 43],\n", " ...,\n", " [-10, -4, -28, ..., 13, 13, 57],\n", " [ 73, -54, -29, ..., 22, -35, 36],\n", " [ 31, 34, 37, ..., -56, 9, -47]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.24.attn.c_attn.bias => tensor([-0.4004, -0.6094, -0.6172, ..., -0.1436, 0.1738, 0.2129],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.attn.c_attn.SCB => tensor([0.1147, 0.1089, 0.1182, ..., 0.1543, 0.1650, 0.1611], device='cuda:0')\n", "transformer.h.24.attn.c_attn.weight_format => col_turing\n", "transformer.h.24.attn.c_proj.weight => tensor([[ 13, -16, -20, ..., -11, -43, 20],\n", " [-36, 20, 4, ..., 6, 13, 4],\n", " [ 3, 22, 3, ..., 10, 18, 1],\n", " ...,\n", " [ -9, 33, 8, ..., -54, 54, -2],\n", " [-89, 
-33, -45, ..., 46, 8, 17],\n", " [-29, -2, -1, ..., 38, 82, -47]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.24.attn.c_proj.bias => tensor([ 0.7188, 0.8945, 0.2559, ..., 0.0364, -0.7969, -1.2812],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.attn.c_proj.SCB => tensor([0.1533, 0.1816, 0.1748, ..., 0.1309, 0.1768, 0.1475], device='cuda:0')\n", "transformer.h.24.attn.c_proj.weight_format => col_turing\n", "transformer.h.24.ln_2.weight => tensor([1.9297, 2.0156, 2.0156, ..., 2.0312, 2.0312, 2.0312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.24.ln_2.bias => tensor([-0.1621, -0.8477, -0.2383, ..., -0.0408, 0.1562, 0.8203],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.mlp.c_fc.weight => tensor([[ 74, 59, 2, ..., -5, 11, -59],\n", " [ 51, 39, -14, ..., -9, -55, 9],\n", " [ 30, -5, -6, ..., 20, 40, -17],\n", " ...,\n", " [-33, -3, -41, ..., -14, -41, -21],\n", " [ 24, -9, -39, ..., 12, 5, -51],\n", " [ 17, 5, 43, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.24.mlp.c_fc.bias => tensor([-0.2100, -2.7969, 0.3594, ..., 0.3828, -0.4824, -0.3320],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.mlp.c_fc.SCB => tensor([0.1299, 0.2041, 0.1455, ..., 0.1152, 0.1221, 0.1631], device='cuda:0')\n", "transformer.h.24.mlp.c_fc.weight_format => col_turing\n", "transformer.h.24.mlp.c_fc2.weight => tensor([[-61, -42, -58, ..., -25, -51, 97],\n", " [-49, -36, 2, ..., -32, 1, -29],\n", " [ 21, -22, -1, ..., -47, -36, -10],\n", " ...,\n", " [ 59, 12, -3, ..., 48, 2, 40],\n", " [ 43, 16, 24, ..., 13, 19, -69],\n", " [-18, 31, -75, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.24.mlp.c_fc2.bias => tensor([-2.0938, -1.4141, -1.8281, ..., -1.4062, -1.5078, -2.1562],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.mlp.c_fc2.SCB => tensor([0.1147, 0.1270, 0.1016, ..., 0.1060, 0.1045, 0.1279], device='cuda:0')\n", "transformer.h.24.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.24.mlp.c_proj.weight => tensor([[-51, -17, -34, ..., 10, -31, 32],\n", " [ 27, -34, 10, ..., 28, -49, 44],\n", " [ 31, -49, -1, ..., -66, 59, 1],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 64, 13, -39, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.24.mlp.c_proj.bias => tensor([ 0.4219, 0.6602, -0.1289, ..., 0.0493, -0.2354, -0.8164],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.24.mlp.c_proj.SCB => tensor([0.1416, 0.1377, 0.1309, ..., 0.1465, 0.1523, 0.1445], device='cuda:0')\n", "transformer.h.24.mlp.c_proj.weight_format => col_turing\n", "transformer.h.25.ln_1.weight => tensor([3.5156, 2.8750, 3.0000, ..., 2.9844, 2.9844, 2.5469], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.25.ln_1.bias => tensor([-0.1377, -1.0625, -0.2695, ..., -0.1191, 0.1865, 0.7305],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.attn.c_attn.weight => tensor([[-29, 27, -26, ..., 3, 28, -22],\n", " [-40, 43, -45, ..., 43, -3, -16],\n", " [-32, 4, 40, ..., -32, -3, 31],\n", " ...,\n", " [ -7, -12, -44, ..., 16, -23, 2],\n", " [ -2, 7, -22, ..., -48, -23, -38],\n", " [ 24, -38, -49, ..., 42, -9, 73]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.25.attn.c_attn.bias => tensor([ 0.3105, -0.4238, -0.5938, ..., -0.4160, 0.0299, 0.1660],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.attn.c_attn.SCB => tensor([0.1147, 0.1030, 0.1084, 
..., 0.1289, 0.1650, 0.1777], device='cuda:0')\n", "transformer.h.25.attn.c_attn.weight_format => col_turing\n", "transformer.h.25.attn.c_proj.weight => tensor([[-14, 36, -67, ..., 5, -2, 10],\n", " [ 13, 18, 5, ..., -14, -29, -10],\n", " [-13, -9, 8, ..., -31, -19, 9],\n", " ...,\n", " [ 24, 16, -14, ..., 35, 34, -80],\n", " [ 32, -46, 4, ..., -35, -54, -6],\n", " [ 10, -16, 15, ..., 84, -25, -28]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.25.attn.c_proj.bias => tensor([ 0.4727, 0.3086, -0.1035, ..., 0.3164, -0.4004, -0.4258],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.attn.c_proj.SCB => tensor([0.1445, 0.1660, 0.1416, ..., 0.1484, 0.1660, 0.1729], device='cuda:0')\n", "transformer.h.25.attn.c_proj.weight_format => col_turing\n", "transformer.h.25.ln_2.weight => tensor([2.0625, 2.0781, 2.0938, ..., 2.0625, 2.1406, 2.1094], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.25.ln_2.bias => tensor([-0.1680, -0.8008, -0.2158, ..., 0.0024, 0.0859, 0.6992],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.mlp.c_fc.weight => tensor([[ -14, 25, -22, ..., 38, -56, -58],\n", " [ -3, -22, -2, ..., -13, -30, -26],\n", " [ 3, 75, 66, ..., -18, -33, 30],\n", " ...,\n", " [ 31, -63, 28, ..., -22, -5, 6],\n", " [ 35, -8, -80, ..., 30, -59, -4],\n", " [ -4, 39, -100, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.25.mlp.c_fc.bias => tensor([-0.1177, -3.1250, 0.3008, ..., 0.0947, -0.7539, -0.0981],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.mlp.c_fc.SCB => tensor([0.1377, 0.1230, 0.1133, ..., 0.1177, 0.1133, 0.1406], device='cuda:0')\n", "transformer.h.25.mlp.c_fc.weight_format => col_turing\n", "transformer.h.25.mlp.c_fc2.weight => tensor([[-100, 7, 18, ..., -45, 10, -18],\n", " [ -2, 52, 36, ..., 22, -42, 29],\n", " [ 51, 34, 22, ..., 5, 58, -3],\n", " ...,\n", " [ -7, 9, -36, ..., 55, 12, -44],\n", " [ 58, -5, -4, ..., 38, -24, 16],\n", " [ 33, -23, -5, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.25.mlp.c_fc2.bias => tensor([-2.2031, -0.4395, -1.5000, ..., -2.1562, -1.7500, -2.4688],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.mlp.c_fc2.SCB => tensor([0.1040, 0.1182, 0.1094, ..., 0.1201, 0.1201, 0.1206], device='cuda:0')\n", "transformer.h.25.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.25.mlp.c_proj.weight => tensor([[-17, -37, -1, ..., -20, 4, 33],\n", " [ 6, -31, 36, ..., -16, 24, 22],\n", " [-11, -2, -30, ..., -27, 11, 16],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 7, -56, -2, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.25.mlp.c_proj.bias => tensor([ 0.2578, 0.7461, -0.4531, ..., 0.5273, -0.3398, -0.5117],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.25.mlp.c_proj.SCB => tensor([0.1338, 0.1357, 0.1338, ..., 0.1377, 0.1289, 0.1348], device='cuda:0')\n", "transformer.h.25.mlp.c_proj.weight_format => col_turing\n", "transformer.h.26.ln_1.weight => tensor([3.9219, 3.0000, 3.2500, ..., 3.2344, 3.2969, 2.5781], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.26.ln_1.bias => tensor([-0.1270, -1.0703, -0.2637, ..., -0.1367, 0.2383, 0.6016],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.attn.c_attn.weight => tensor([[-34, -21, -8, ..., -73, 7, 6],\n", " [-46, -48, -38, ..., 5, -56, -30],\n", " [ 19, 9, -16, ..., 34, 15, -14],\n", " ...,\n", " [-55, -35, 14, ..., 53, 51, -12],\n", " [ 32, 5, 
-28, ..., -3, 41, 9],\n", " [-19, -34, 8, ..., -17, -46, -9]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.26.attn.c_attn.bias => tensor([ 0.0554, -0.6719, -0.2148, ..., 0.1240, -0.3496, 0.2715],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.attn.c_attn.SCB => tensor([0.1045, 0.1060, 0.1289, ..., 0.1562, 0.1572, 0.1846], device='cuda:0')\n", "transformer.h.26.attn.c_attn.weight_format => col_turing\n", "transformer.h.26.attn.c_proj.weight => tensor([[-32, 15, 6, ..., 59, 36, -6],\n", " [-15, -25, -14, ..., -19, 5, 53],\n", " [-14, 2, -21, ..., -27, 24, 23],\n", " ...,\n", " [ 40, 13, -11, ..., 22, -27, -5],\n", " [ 2, 13, -80, ..., 33, 2, 67],\n", " [-22, -17, 27, ..., 28, -9, -17]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.26.attn.c_proj.bias => tensor([ 0.9219, 0.0540, 0.0112, ..., 0.1953, -0.6094, -0.4023],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.attn.c_proj.SCB => tensor([0.1602, 0.1748, 0.1826, ..., 0.1621, 0.1465, 0.1660], device='cuda:0')\n", "transformer.h.26.attn.c_proj.weight_format => col_turing\n", "transformer.h.26.ln_2.weight => tensor([2.1562, 2.1406, 2.1406, ..., 2.1406, 2.1094, 2.2031], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.26.ln_2.bias => tensor([-0.1562, -0.7344, -0.1582, ..., -0.0260, 0.1348, 0.6094],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.mlp.c_fc.weight => tensor([[-37, 16, 33, ..., -62, -14, 32],\n", " [-13, -34, -8, ..., -6, 40, 1],\n", " [ 20, -13, 4, ..., 5, 51, -9],\n", " ...,\n", " [ 7, -28, -28, ..., 2, -43, 13],\n", " [-26, -76, -6, ..., -74, -12, -49],\n", " [ 20, -66, -30, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.26.mlp.c_fc.bias => tensor([ 0.6250, 0.6523, 0.1152, ..., 0.7539, -3.9844, 0.8164],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.mlp.c_fc.SCB => tensor([0.1553, 0.2295, 0.1289, ..., 0.1084, 0.1484, 0.1963], device='cuda:0')\n", "transformer.h.26.mlp.c_fc.weight_format => col_turing\n", "transformer.h.26.mlp.c_fc2.weight => tensor([[-18, 0, 26, ..., -10, 9, 21],\n", " [ 20, -42, 1, ..., -42, -20, 4],\n", " [ 3, 36, 17, ..., 36, -48, 34],\n", " ...,\n", " [ 6, 11, 7, ..., -8, -44, -45],\n", " [ 18, 16, 4, ..., 30, 16, -7],\n", " [-21, -12, 3, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.26.mlp.c_fc2.bias => tensor([-1.7812, 0.4082, -1.8516, ..., -1.0625, -1.2266, -0.3809],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.mlp.c_fc2.SCB => tensor([0.1147, 0.1475, 0.1484, ..., 0.1235, 0.1543, 0.1484], device='cuda:0')\n", "transformer.h.26.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.26.mlp.c_proj.weight => tensor([[ -3, 27, -22, ..., -64, 52, -6],\n", " [ 2, -23, -46, ..., -43, -12, -44],\n", " [ 20, -21, -3, ..., 54, 9, 19],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 16, -27, -19, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.26.mlp.c_proj.bias => tensor([ 0.8906, -0.0562, -0.4180, ..., 0.0854, -0.8594, -0.5625],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.26.mlp.c_proj.SCB => tensor([0.1387, 0.1318, 0.1338, ..., 0.1494, 0.1406, 0.1494], device='cuda:0')\n", "transformer.h.26.mlp.c_proj.weight_format => col_turing\n", "transformer.h.27.ln_1.weight => tensor([4.0312, 3.0625, 3.3125, ..., 3.3281, 3.3906, 2.7031], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.27.ln_1.bias => tensor([-0.2354, 
-1.0391, -0.1650, ..., -0.1895, 0.2852, 0.6016],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.attn.c_attn.weight => tensor([[ 107, 17, 24, ..., 54, 47, 41],\n", " [ -30, 51, -1, ..., -19, 0, -39],\n", " [ 30, 28, -117, ..., 22, -14, 84],\n", " ...,\n", " [ -9, -22, -9, ..., 58, -29, 12],\n", " [ 8, 10, 68, ..., 28, -57, -10],\n", " [ 35, 23, 62, ..., -24, 12, 57]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.27.attn.c_attn.bias => tensor([-0.6172, -0.0830, -0.4844, ..., -0.4141, 0.1396, -0.5312],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.attn.c_attn.SCB => tensor([0.1206, 0.1079, 0.1035, ..., 0.1494, 0.1660, 0.1719], device='cuda:0')\n", "transformer.h.27.attn.c_attn.weight_format => col_turing\n", "transformer.h.27.attn.c_proj.weight => tensor([[-14, 10, 11, ..., 27, 46, -52],\n", " [-37, -28, -34, ..., 16, -45, -18],\n", " [ 15, -38, -50, ..., 38, 20, -3],\n", " ...,\n", " [-41, 29, 2, ..., -14, 1, -27],\n", " [-54, 40, -29, ..., 35, -55, 19],\n", " [ 2, 30, 9, ..., -10, -68, -11]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.27.attn.c_proj.bias => tensor([ 1.5234, -0.2930, -0.4395, ..., 0.1553, -1.1641, -0.3438],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.attn.c_proj.SCB => tensor([0.1748, 0.1719, 0.1680, ..., 0.1826, 0.1611, 0.1367], device='cuda:0')\n", "transformer.h.27.attn.c_proj.weight_format => col_turing\n", "transformer.h.27.ln_2.weight => tensor([2.1875, 2.1719, 2.1875, ..., 2.2344, 2.2031, 2.2188], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.27.ln_2.bias => tensor([-0.2656, -0.7695, 0.0048, ..., -0.0415, 0.3184, 0.6016],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.mlp.c_fc.weight => tensor([[-17, 10, 3, ..., -20, -4, 7],\n", " [ 36, 5, -14, ..., -18, 6, 40],\n", " [-20, -9, -7, ..., 24, 8, -39],\n", " ...,\n", " [ -4, 30, -2, ..., -19, 55, 44],\n", " [ -2, 1, -7, ..., 14, 8, 71],\n", " [ 39, 54, 31, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.27.mlp.c_fc.bias => tensor([ 0.4883, -1.4141, 0.3301, ..., 0.2344, -1.3203, 0.2139],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.mlp.c_fc.SCB => tensor([0.1377, 0.1250, 0.1299, ..., 0.1196, 0.1729, 0.1377], device='cuda:0')\n", "transformer.h.27.mlp.c_fc.weight_format => col_turing\n", "transformer.h.27.mlp.c_fc2.weight => tensor([[ -54, -33, 19, ..., -30, -32, 50],\n", " [ -17, -14, 2, ..., -26, -62, -1],\n", " [ 25, -20, -19, ..., -71, 1, -14],\n", " ...,\n", " [ -21, -42, -10, ..., -17, 21, 39],\n", " [ 16, 23, -41, ..., -58, -16, -22],\n", " [ 53, 70, -103, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.27.mlp.c_fc2.bias => tensor([-1.5625, -0.7500, -1.0781, ..., -1.8438, -0.7734, -1.9219],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.mlp.c_fc2.SCB => tensor([0.1064, 0.1289, 0.1123, ..., 0.1250, 0.1001, 0.1250], device='cuda:0')\n", "transformer.h.27.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.27.mlp.c_proj.weight => tensor([[ 14, 31, 43, ..., -36, -13, -20],\n", " [-21, -5, 40, ..., 28, -68, -20],\n", " [ 50, 10, 11, ..., -49, -8, -23],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [-45, -40, 44, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.27.mlp.c_proj.bias => tensor([ 1.2500, -0.4141, -0.7422, ..., 0.1943, -0.7461, -0.3965],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.27.mlp.c_proj.SCB 
=> tensor([0.1426, 0.1621, 0.1377, ..., 0.1348, 0.1367, 0.1309], device='cuda:0')\n", "transformer.h.27.mlp.c_proj.weight_format => col_turing\n", "transformer.h.28.ln_1.weight => tensor([4.1875, 3.3281, 3.4219, ..., 3.6562, 3.6250, 2.9219], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.28.ln_1.bias => tensor([-0.3750, -0.8750, -0.1836, ..., -0.2773, 0.3984, 0.6914],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.attn.c_attn.weight => tensor([[-22, 1, -25, ..., -6, 23, 27],\n", " [ -2, 53, -1, ..., 15, 78, 3],\n", " [-48, -40, -16, ..., -38, 22, 18],\n", " ...,\n", " [-19, -5, 1, ..., -30, 61, 30],\n", " [ -4, -39, 34, ..., 27, -4, 8],\n", " [-11, 34, -29, ..., 8, 44, 2]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.28.attn.c_attn.bias => tensor([-0.4961, 0.6328, -0.5898, ..., 0.0113, 0.5234, -0.7031],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.attn.c_attn.SCB => tensor([0.0918, 0.0918, 0.0977, ..., 0.1650, 0.2256, 0.1514], device='cuda:0')\n", "transformer.h.28.attn.c_attn.weight_format => col_turing\n", "transformer.h.28.attn.c_proj.weight => tensor([[-39, 62, -45, ..., -58, 25, -23],\n", " [ -8, 26, -16, ..., 38, 7, 1],\n", " [-20, -4, 13, ..., 8, -4, -45],\n", " ...,\n", " [-12, 10, -6, ..., 46, -51, 7],\n", " [ 17, 31, 9, ..., 12, 0, -17],\n", " [-53, 29, 40, ..., 9, 50, 33]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.28.attn.c_proj.bias => tensor([ 1.4219, -0.7969, -0.4102, ..., 0.1934, -0.6328, -0.2480],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.attn.c_proj.SCB => tensor([0.1455, 0.1738, 0.1670, ..., 0.1631, 0.1709, 0.1504], device='cuda:0')\n", "transformer.h.28.attn.c_proj.weight_format => col_turing\n", "transformer.h.28.ln_2.weight => tensor([2.2344, 2.2969, 2.2656, ..., 2.3125, 2.3125, 2.2812], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.28.ln_2.bias => tensor([-0.3613, -0.6680, 0.0625, ..., -0.0459, 0.3340, 0.4766],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.mlp.c_fc.weight => tensor([[-22, -31, -45, ..., -27, 55, 35],\n", " [ 14, 15, -63, ..., -32, -27, -13],\n", " [-22, -41, 11, ..., -62, -18, 21],\n", " ...,\n", " [ 30, 30, -13, ..., 67, -31, 17],\n", " [ 16, 31, 19, ..., 64, 16, 24],\n", " [-46, -10, -16, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.28.mlp.c_fc.bias => tensor([ 0.2109, -0.2969, 0.1357, ..., 0.2285, -0.1729, -0.1396],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.mlp.c_fc.SCB => tensor([0.1475, 0.1338, 0.1230, ..., 0.1445, 0.1367, 0.1289], device='cuda:0')\n", "transformer.h.28.mlp.c_fc.weight_format => col_turing\n", "transformer.h.28.mlp.c_fc2.weight => tensor([[ 43, -12, 36, ..., -4, -33, -16],\n", " [-24, 21, 35, ..., 6, -52, -28],\n", " [ -5, 30, -4, ..., -18, 37, -30],\n", " ...,\n", " [-37, -58, -4, ..., -31, 16, 0],\n", " [ 8, 7, -34, ..., -9, 21, -35],\n", " [ 4, 33, 25, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.28.mlp.c_fc2.bias => tensor([-1.9062, -0.3301, -1.3984, ..., -1.8281, -1.4453, -1.7891],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.mlp.c_fc2.SCB => tensor([0.1245, 0.0957, 0.1152, ..., 0.1191, 0.1064, 0.1123], device='cuda:0')\n", "transformer.h.28.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.28.mlp.c_proj.weight => tensor([[-81, 47, -2, ..., 14, 1, -25],\n", " [ 10, 5, 14, ..., -18, -2, 34],\n", " [-20, 48, -14, ..., 9, 37, 5],\n", " ...,\n", " [ 0, 0, 0, 
..., 0, 0, 0],\n", " [-59, -84, 53, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.28.mlp.c_proj.bias => tensor([ 1.5234, -0.2832, -0.1338, ..., 0.3477, -0.5781, -0.0306],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.28.mlp.c_proj.SCB => tensor([0.1406, 0.1475, 0.1387, ..., 0.1650, 0.1641, 0.1562], device='cuda:0')\n", "transformer.h.28.mlp.c_proj.weight_format => col_turing\n", "transformer.h.29.ln_1.weight => tensor([3.9688, 3.3125, 3.4219, ..., 3.5781, 3.6406, 3.0312], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.29.ln_1.bias => tensor([-0.4258, -0.7930, -0.0091, ..., -0.1582, 0.6172, 0.5469],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.attn.c_attn.weight => tensor([[ 19, 24, -39, ..., -28, 10, 23],\n", " [-28, -2, 26, ..., 11, -47, 23],\n", " [-51, 17, 24, ..., -15, -14, 23],\n", " ...,\n", " [ -4, 54, -35, ..., -30, 10, 67],\n", " [ 22, -18, 20, ..., 18, -29, 0],\n", " [ 22, 4, 13, ..., -23, 35, 4]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.29.attn.c_attn.bias => tensor([ 0.8984, -0.8164, -2.5312, ..., 0.0271, -0.0232, 0.5742],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.attn.c_attn.SCB => tensor([0.1021, 0.1011, 0.1113, ..., 0.1338, 0.1543, 0.1572], device='cuda:0')\n", "transformer.h.29.attn.c_attn.weight_format => col_turing\n", "transformer.h.29.attn.c_proj.weight => tensor([[ -8, 3, -40, ..., -28, 18, -20],\n", " [ 8, 13, 20, ..., -63, -29, 19],\n", " [-13, 42, -70, ..., 6, -14, 38],\n", " ...,\n", " [ 40, -24, 49, ..., 0, 29, 46],\n", " [ 4, -5, 15, ..., -40, -27, -10],\n", " [-14, 12, 9, ..., -8, 10, 40]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.29.attn.c_proj.bias => tensor([ 1.2031, -0.5664, -0.4355, ..., 0.4062, -0.3965, -0.2090],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.attn.c_proj.SCB => tensor([0.1689, 0.1494, 0.1406, ..., 0.1309, 0.1406, 0.1494], device='cuda:0')\n", "transformer.h.29.attn.c_proj.weight_format => col_turing\n", "transformer.h.29.ln_2.weight => tensor([2.3438, 2.3438, 2.3594, ..., 2.3750, 2.3594, 2.3594], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.29.ln_2.bias => tensor([-0.4062, -0.5508, 0.0952, ..., -0.0476, 0.3281, 0.4297],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.mlp.c_fc.weight => tensor([[-28, 35, -77, ..., 17, 35, 78],\n", " [-69, -17, 9, ..., 47, -17, -21],\n", " [-12, -44, -36, ..., -52, 49, 71],\n", " ...,\n", " [ 50, -13, -21, ..., 12, 19, -17],\n", " [-58, 42, 11, ..., 26, -12, 16],\n", " [ 21, 13, 16, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.29.mlp.c_fc.bias => tensor([ 0.2158, -0.6719, 0.2520, ..., -0.1846, 0.2090, 0.2910],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.mlp.c_fc.SCB => tensor([0.1182, 0.1299, 0.1338, ..., 0.1226, 0.1167, 0.1152], device='cuda:0')\n", "transformer.h.29.mlp.c_fc.weight_format => col_turing\n", "transformer.h.29.mlp.c_fc2.weight => tensor([[ -38, 21, 30, ..., -62, -24, 58],\n", " [-100, 59, 19, ..., -31, -50, 12],\n", " [ -2, 53, -1, ..., -19, -11, 18],\n", " ...,\n", " [ 10, -26, 60, ..., -35, -60, -6],\n", " [ -76, 21, 14, ..., -3, 20, 7],\n", " [ 17, 22, -42, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.29.mlp.c_fc2.bias => tensor([-1.3828, -0.5703, -0.9961, ..., -1.9219, -0.9766, -1.3203],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.mlp.c_fc2.SCB 
=> tensor([0.1187, 0.0996, 0.1099, ..., 0.1128, 0.1099, 0.1084], device='cuda:0')\n", "transformer.h.29.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.29.mlp.c_proj.weight => tensor([[-18, 44, -36, ..., 4, -8, -64],\n", " [ 17, -11, 17, ..., -59, 7, 5],\n", " [ 80, 10, 26, ..., -57, 18, 24],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ -8, -18, 4, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.29.mlp.c_proj.bias => tensor([ 0.3398, -0.4414, -0.4668, ..., 0.2178, -0.3008, -0.1660],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.29.mlp.c_proj.SCB => tensor([0.1631, 0.1377, 0.1338, ..., 0.1338, 0.1387, 0.1484], device='cuda:0')\n", "transformer.h.29.mlp.c_proj.weight_format => col_turing\n", "transformer.h.30.ln_1.weight => tensor([4.0938, 3.6406, 3.8594, ..., 3.6875, 3.9062, 3.2656], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.30.ln_1.bias => tensor([-0.3477, -0.7930, -0.0114, ..., -0.3301, 0.4824, 0.6367],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.attn.c_attn.weight => tensor([[ 8, -8, 24, ..., -8, -15, 4],\n", " [-18, 51, 13, ..., 66, -8, -37],\n", " [-48, -14, 48, ..., 27, -32, 60],\n", " ...,\n", " [-40, 0, 4, ..., 0, -17, -48],\n", " [-33, 51, 13, ..., -43, -77, -2],\n", " [ 16, 35, -29, ..., -4, 72, -16]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.30.attn.c_attn.bias => tensor([ 1.1484, 0.7109, -0.6484, ..., -0.1021, -0.1689, -0.0566],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.attn.c_attn.SCB => tensor([0.1133, 0.0894, 0.1099, ..., 0.1982, 0.1533, 0.2070], device='cuda:0')\n", "transformer.h.30.attn.c_attn.weight_format => col_turing\n", "transformer.h.30.attn.c_proj.weight => tensor([[ -2, 30, -7, ..., 19, -5, 6],\n", " [ 18, 9, -38, ..., -6, -3, 10],\n", " [ 6, -10, 22, ..., 2, 16, 15],\n", " ...,\n", " [ 9, 16, -13, ..., 23, 2, -2],\n", " [-34, 20, -11, ..., 34, -12, -2],\n", " [ 33, 10, -33, ..., 67, -32, 6]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.30.attn.c_proj.bias => tensor([ 0.3906, -0.4414, -0.6406, ..., 0.4863, -0.3809, -0.3574],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.attn.c_proj.SCB => tensor([0.1641, 0.1729, 0.1475, ..., 0.1699, 0.1914, 0.1406], device='cuda:0')\n", "transformer.h.30.attn.c_proj.weight_format => col_turing\n", "transformer.h.30.ln_2.weight => tensor([2.4062, 2.3594, 2.3750, ..., 2.4062, 2.3438, 2.4375], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.30.ln_2.bias => tensor([-0.2910, -0.5039, -0.0723, ..., -0.1406, 0.5391, 0.6016],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.mlp.c_fc.weight => tensor([[-60, 49, -2, ..., -15, -8, 41],\n", " [ -2, 41, 19, ..., 31, 0, 116],\n", " [-19, 16, -15, ..., -7, -8, 19],\n", " ...,\n", " [ 31, -28, -63, ..., -21, 20, 18],\n", " [ 42, -49, -14, ..., 19, 57, -14],\n", " [-21, 18, 46, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.30.mlp.c_fc.bias => tensor([1.3438, 2.2500, 1.2734, ..., 1.8828, 0.1104, 3.2188], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.30.mlp.c_fc.SCB => tensor([0.1494, 0.1504, 0.1348, ..., 0.1406, 0.1201, 0.2256], device='cuda:0')\n", "transformer.h.30.mlp.c_fc.weight_format => col_turing\n", "transformer.h.30.mlp.c_fc2.weight => tensor([[ 24, -32, 39, ..., 35, -16, 26],\n", " [-24, 45, -15, ..., -40, 10, 16],\n", " [-43, -25, -25, ..., -76, -16, 13],\n", " ...,\n", " [ 1, -40, 72, ..., 
-33, 83, -31],\n", " [ -1, -28, 3, ..., 79, 2, 27],\n", " [ 46, 65, -15, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.30.mlp.c_fc2.bias => tensor([-4.0000, -1.8516, -1.1328, ..., -1.6719, -0.7422, -3.0156],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.mlp.c_fc2.SCB => tensor([0.1172, 0.1030, 0.0991, ..., 0.0962, 0.1104, 0.1455], device='cuda:0')\n", "transformer.h.30.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.30.mlp.c_proj.weight => tensor([[ 10, 31, 8, ..., 23, 36, 8],\n", " [-17, 9, -13, ..., -21, -31, 7],\n", " [ 0, -41, -16, ..., 32, -32, -8],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 5, -27, -23, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.30.mlp.c_proj.bias => tensor([-0.9805, -0.2852, -0.3398, ..., 0.3730, 0.3457, 0.0674],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.30.mlp.c_proj.SCB => tensor([0.1738, 0.1387, 0.1357, ..., 0.1416, 0.1514, 0.1406], device='cuda:0')\n", "transformer.h.30.mlp.c_proj.weight_format => col_turing\n", "transformer.h.31.ln_1.weight => tensor([3.5312, 3.1562, 3.3594, ..., 3.4219, 3.3125, 3.0156], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.31.ln_1.bias => tensor([-0.1592, -0.5352, 0.0977, ..., -0.1797, 0.4180, 0.7227],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.attn.c_attn.weight => tensor([[ 26, 24, -32, ..., -24, -15, -52],\n", " [ -3, 50, -30, ..., 23, 17, -41],\n", " [-62, -12, 22, ..., 63, 7, 64],\n", " ...,\n", " [ 1, -33, 7, ..., 3, -62, 13],\n", " [ 45, 29, 1, ..., 15, -11, 0],\n", " [-35, 24, 38, ..., 7, 15, -96]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.31.attn.c_attn.bias => tensor([-0.0337, 2.6250, 0.9219, ..., 0.1709, -0.8320, 0.0928],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.attn.c_attn.SCB => tensor([0.1172, 0.0913, 0.1367, ..., 0.1318, 0.1187, 0.1196], device='cuda:0')\n", "transformer.h.31.attn.c_attn.weight_format => col_turing\n", "transformer.h.31.attn.c_proj.weight => tensor([[-38, 8, -9, ..., 21, 0, -40],\n", " [-16, 22, 8, ..., 6, 42, 22],\n", " [ 21, 38, 26, ..., 52, 25, 20],\n", " ...,\n", " [ -6, -8, -22, ..., 6, -33, 51],\n", " [-26, 39, 16, ..., -38, -19, 11],\n", " [-14, 2, -14, ..., 12, 75, -32]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.31.attn.c_proj.bias => tensor([-0.5234, -0.1904, 0.1279, ..., 0.7969, 0.3262, -0.4473],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.attn.c_proj.SCB => tensor([0.1621, 0.1621, 0.1426, ..., 0.1572, 0.1367, 0.1309], device='cuda:0')\n", "transformer.h.31.attn.c_proj.weight_format => col_turing\n", "transformer.h.31.ln_2.weight => tensor([2.1562, 2.3125, 2.2500, ..., 2.3281, 2.2188, 2.3438], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.h.31.ln_2.bias => tensor([ 0.0317, -0.3359, 0.0957, ..., -0.2617, 0.2051, 0.3086],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.mlp.c_fc.weight => tensor([[ 79, 10, -29, ..., -42, -70, -7],\n", " [ 3, 46, 41, ..., -3, 10, -19],\n", " [ 0, -28, -34, ..., 3, -1, -28],\n", " ...,\n", " [ 22, -31, -20, ..., 11, -11, -26],\n", " [ 13, 60, -12, ..., 20, -7, -27],\n", " [ 16, 28, -20, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.31.mlp.c_fc.bias => tensor([-0.8828, 3.7969, 7.2188, ..., 2.3906, -6.4062, 3.0781],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.mlp.c_fc.SCB => tensor([0.1250, 0.1836, 
0.2041, ..., 0.1416, 0.1758, 0.2734], device='cuda:0')\n", "transformer.h.31.mlp.c_fc.weight_format => col_turing\n", "transformer.h.31.mlp.c_fc2.weight => tensor([[-26, 22, 33, ..., -6, 29, -30],\n", " [ 40, 33, -29, ..., -50, -7, -21],\n", " [ 31, 0, -12, ..., -28, 5, -4],\n", " ...,\n", " [-47, -4, 52, ..., 0, -1, -23],\n", " [-18, 55, -16, ..., 31, 23, -3],\n", " [-37, 39, -36, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.31.mlp.c_fc2.bias => tensor([-6.3750, -2.6875, -2.3906, ..., -1.5391, -2.8750, -2.5781],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.mlp.c_fc2.SCB => tensor([0.1191, 0.1245, 0.2314, ..., 0.1079, 0.1973, 0.0991], device='cuda:0')\n", "transformer.h.31.mlp.c_fc2.weight_format => col_turing\n", "transformer.h.31.mlp.c_proj.weight => tensor([[-14, -7, 4, ..., -21, 5, 3],\n", " [-20, 15, 38, ..., -7, 32, -42],\n", " [-20, -1, 7, ..., -7, 2, 17],\n", " ...,\n", " [ 0, 0, 0, ..., 0, 0, 0],\n", " [ 44, -28, 24, ..., 0, 0, 0],\n", " [ 0, 0, 0, ..., 0, 0, 0]], device='cuda:0',\n", " dtype=torch.int8)\n", "transformer.h.31.mlp.c_proj.bias => tensor([-0.2793, -0.0566, -0.2539, ..., 0.3887, 0.1572, 1.1953],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.h.31.mlp.c_proj.SCB => tensor([0.1680, 0.1562, 0.1885, ..., 0.1738, 0.1426, 0.1416], device='cuda:0')\n", "transformer.h.31.mlp.c_proj.weight_format => col_turing\n", "transformer.ln_f.weight => tensor([3.6094, 3.5781, 3.5781, ..., 3.5156, 3.6562, 3.5938], device='cuda:0',\n", " dtype=torch.bfloat16)\n", "transformer.ln_f.bias => tensor([ 0.0508, -0.2422, -0.0815, ..., 0.0427, 0.0703, 0.5703],\n", " device='cuda:0', dtype=torch.bfloat16)\n", "transformer.relative_pe.slopes => tensor([[0.8398],\n", " [0.7070],\n", " [0.5938],\n", " [0.5000],\n", " [0.4199],\n", " [0.3535],\n", " [0.2969],\n", " [0.2500],\n", " [0.2100],\n", " [0.1768],\n", " [0.1484],\n", " [0.1250],\n", " [0.1050],\n", " [0.0884],\n", " [0.0742],\n", " [0.0625],\n", " [0.0525],\n", " [0.0442],\n", " [0.0371],\n", " [0.0312],\n", " [0.0262],\n", " [0.0221],\n", " [0.0186],\n", " [0.0156],\n", " [0.0131],\n", " [0.0110],\n", " [0.0093],\n", " [0.0078],\n", " [0.0066],\n", " [0.0055],\n", " [0.0046],\n", " [0.0039]], device='cuda:0', dtype=torch.bfloat16)\n", "lm_head.weight => tensor([[ 0.0200, 0.0442, 0.0562, ..., 0.0173, -0.0238, -0.0889],\n", " [-0.0259, 0.0170, -0.0221, ..., -0.0752, -0.0635, 0.0947],\n", " [-0.0276, 0.1846, 0.1533, ..., -0.0195, 0.0299, 0.0796],\n", " ...,\n", " [ 0.1182, 0.1523, 0.0742, ..., -0.1162, 0.0177, 0.0991],\n", " [ 0.0220, -0.0579, 0.0125, ..., -0.0576, 0.0327, 0.0211],\n", " [ 0.0508, -0.0217, 0.0278, ..., -0.0308, -0.0378, 0.0013]],\n", " device='cuda:0', dtype=torch.bfloat16)\n" ] } ] }, { "cell_type": "code", "source": [ "import sys\n", "import os\n", "import struct\n", "import json\n", "\n", "import torch\n", "from transformers import AutoConfig\n", "\n", "config = AutoConfig.from_pretrained(\"cerebras/btlm-3b-8k-base\", trust_remote_code=True)\n", "hparams = config.to_dict()\n", "fname_out = \"btlm-3b.ggml.bin\"\n", "\n", "print(json.dumps(hparams, indent=4, sort_keys=True))\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5OGkZ-a8hyui", "outputId": "ef706529-5bd8-4183-f43d-4c8d7eb44f23" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{\n", " \"_name_or_path\": \"cerebras/btlm-3b-8k-base\",\n", " \"activation_function\": \"swiglu\",\n", " \"add_cross_attention\": false,\n", " 
\"architectures\": [\n", " \"BTLMLMHeadModel\"\n", " ],\n", " \"attn_pdrop\": 0.0,\n", " \"auto_map\": {\n", " \"AutoConfig\": \"cerebras/btlm-3b-8k-base--configuration_btlm.BTLMConfig\",\n", " \"AutoModel\": \"cerebras/btlm-3b-8k-base--modeling_btlm.BTLMModel\",\n", " \"AutoModelForCausalLM\": \"cerebras/btlm-3b-8k-base--modeling_btlm.BTLMLMHeadModel\",\n", " \"AutoModelForQuestionAnswering\": \"cerebras/btlm-3b-8k-base--modeling_btlm.BTLMForQuestionAnswering\",\n", " \"AutoModelForSequenceClassification\": \"cerebras/btlm-3b-8k-base--modeling_btlm.BTLMForSequenceClassification\",\n", " \"AutoModelForTokenClassification\": \"cerebras/btlm-3b-8k-base--modeling_btlm.BTLMForTokenClassification\"\n", " },\n", " \"bad_words_ids\": null,\n", " \"begin_suppress_tokens\": null,\n", " \"bos_token_id\": 50256,\n", " \"chunk_size_feed_forward\": 0,\n", " \"cross_attention_hidden_size\": null,\n", " \"decoder_start_token_id\": null,\n", " \"diversity_penalty\": 0.0,\n", " \"do_sample\": false,\n", " \"early_stopping\": false,\n", " \"embd_pdrop\": 0.0,\n", " \"encoder_no_repeat_ngram_size\": 0,\n", " \"eos_token_id\": 50256,\n", " \"exponential_decay_length_penalty\": null,\n", " \"finetuning_task\": null,\n", " \"forced_bos_token_id\": null,\n", " \"forced_eos_token_id\": null,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\"\n", " },\n", " \"initializer_range\": 0.073,\n", " \"is_decoder\": false,\n", " \"is_encoder_decoder\": false,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1\n", " },\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"length_penalty\": 1.0,\n", " \"max_length\": 20,\n", " \"min_length\": 0,\n", " \"model_type\": \"btlm\",\n", " \"mup_embeddings_scale\": 14.6,\n", " \"mup_output_alpha\": 2.22,\n", " \"mup_scale_qk_dot_by_d\": true,\n", " \"mup_width_scale\": 0.1,\n", " \"n_embd\": 2560,\n", " \"n_head\": 32,\n", " \"n_inner\": 6826,\n", " \"n_layer\": 32,\n", " \"n_positions\": 8192,\n", " \"no_repeat_ngram_size\": 0,\n", " \"num_beam_groups\": 1,\n", " \"num_beams\": 1,\n", " \"num_return_sequences\": 1,\n", " \"output_attentions\": false,\n", " \"output_hidden_states\": false,\n", " \"output_scores\": false,\n", " \"pad_token_id\": null,\n", " \"position_embedding_type\": \"alibi\",\n", " \"prefix\": null,\n", " \"problem_type\": null,\n", " \"pruned_heads\": {},\n", " \"remove_invalid_values\": false,\n", " \"reorder_and_upcast_attn\": false,\n", " \"repetition_penalty\": 1.0,\n", " \"resid_pdrop\": 0.0,\n", " \"return_dict\": true,\n", " \"return_dict_in_generate\": false,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"sep_token_id\": null,\n", " \"suppress_tokens\": null,\n", " \"task_specific_params\": null,\n", " \"temperature\": 1.0,\n", " \"tf_legacy_loss\": false,\n", " \"tie_encoder_decoder\": false,\n", " \"tie_word_embeddings\": true,\n", " \"tokenizer_class\": null,\n", " \"top_k\": 50,\n", " \"top_p\": 1.0,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"torchscript\": false,\n", " \"transformers_version\": \"4.31.0\",\n", " \"typical_p\": 1.0,\n", " \"use_bfloat16\": false,\n", " \"use_cache\": true,\n", " \"vocab_size\": 50257\n", "}\n" ] } ] }, { "cell_type": "code", "source": [ "import re\n", "import numpy as np\n", "\n", "fout = open(fname_out, \"wb\")\n", "\n", "fout.write(struct.pack(\"i\", 0x67676D6C))\n", "fout.write(struct.pack(\"i\", hparams[\"vocab_size\"]))\n", "fout.write(struct.pack(\"i\", hparams[\"n_positions\"]))\n", "fout.write(struct.pack(\"i\", 
hparams[\"n_embd\"]))\n", "fout.write(struct.pack(\"i\", hparams[\"n_head\"]))\n", "fout.write(struct.pack(\"i\", hparams[\"n_layer\"]))\n", "fout.write(struct.pack(\"i\", hparams[\"n_inner\"]))\n", "fout.write(struct.pack(\"i\", 1))\n", "\n", "for i in range(hparams[\"vocab_size\"]):\n", " text = tokenizer.decode([i]).encode('utf-8')\n", " fout.write(struct.pack(\"i\", len(text)))\n", " fout.write(text)\n", "\n", "\n", "# for name in list_vars.keys():\n", "# print(name, \"=>\", list_vars[name])\n", "\n", "\n", "for name in list_vars.keys():\n", " if name[-14:] == \".weight_format\":\n", " print(\"FOUND \" + name)\n", " continue\n", "\n", "\n", "\n", "\n", " print(\"Processing variable: \" + name)\n", " data = list_vars[name].squeeze().cpu().type(dtype=torch.float16).numpy()\n", " print(\" with shape: \", data.shape)\n", "\n", " # rename headers to keep compatibility\n", " if name == \"transformer.ln_f.weight\":\n", " name = \"model/ln_f/g\"\n", " elif name == \"transformer.ln_f.bias\":\n", " name = \"model/ln_f/b\"\n", " elif name == \"transformer.wte.weight\":\n", " name = \"model/wte\"\n", " elif name == \"transformer.wpe.weight\":\n", " name = \"model/wpe\"\n", " elif name == \"lm_head.weight\":\n", " name = \"model/lm_head\"\n", " elif name == \"transformer.relative_pe.slopes\":\n", " name = \"model/relative_pe/slopes\"\n", " elif re.match(r\"transformer.h\\.\\d+\\.ln_1\\.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/ln_1/g\"\n", " elif re.match(r\"transformer.h\\.\\d+\\.ln_1\\.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/ln_1/b\"\n", " elif re.match(r\"transformer.h\\.\\d+\\.attn\\.c_attn\\.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_attn/w\"\n", " elif re.match(r\"transformer.h\\.\\d+\\.attn\\.c_attn\\.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_attn/b\"\n", " elif re.match(r\"transformer.h\\.\\d+\\.attn\\.c_proj\\.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_proj/w\"\n", " elif re.match(r\"transformer.h.\\d+.attn.c_proj.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_proj/b\"\n", " elif re.match(r\"transformer.h.\\d+.ln_2.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/ln_2/g\"\n", " elif re.match(r\"transformer.h.\\d+.ln_2.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/ln_2/b\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc/w\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc/b\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_proj.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_proj/w\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_proj.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_proj/b\"\n", " # NEW\n", " elif re.match(r\"transformer.h.\\d+.attn.c_proj.SCB\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_proj/scb\"\n", " elif re.match(r\"transformer.h.\\d+.attn.c_attn.SCB\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/attn/c_attn/scb\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc.SCB\", name):\n", " i = 
re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc/scb\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc2.weight\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc2/w\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc2.bias\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc2/b\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_fc2.SCB\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_fc2/scb\"\n", " elif re.match(r\"transformer.h.\\d+.mlp.c_proj.SCB\", name):\n", " i = re.findall(\"\\d+\", name)[0]\n", " name = f\"model/h{i}/mlp/c_proj/scb\"\n", "\n", " else:\n", " print(\"Unrecognized variable name. %s\", name)\n", "\n", "\n", " n_dims = len(data.shape);\n", "\n", " # ftype == 0 -> float32, ftype == 1 -> float16\n", " ftype = 1;\n", " print(\" Converting to float16\")\n", " data = data.astype(np.float16)\n", " ftype = 8\n", "\n", "\n", " # for efficiency - transpose the projection matrices\n", " # \"model/h.*/attn/c_attn/w\"\n", " # \"model/h.*/attn/c_proj/w\"\n", " # \"model/h.*/mlp/c_fc/w\"\n", " # \"model/h.*/mlp/c_proj/w\"\n", " if name[-14:] == \"/attn/c_attn/w\" or \\\n", " name[-14:] == \"/attn/c_proj/w\" or \\\n", " name[-11:] == \"/mlp/c_fc/w\" or \\\n", " name[-13:] == \"/mlp/c_proj/w\":\n", " print(\" Transposing\")\n", " data = data.transpose()\n", "\n", " # header\n", " str = name.encode('utf-8')\n", " fout.write(struct.pack(\"iii\", n_dims, len(str), ftype))\n", " for i in range(n_dims):\n", " fout.write(struct.pack(\"i\", data.shape[n_dims - 1 - i]))\n", " fout.write(str);\n", "\n", " # data\n", " data.tofile(fout)\n", "\n", "fout.close()\n", "\n", "print(\"Done. Output file: \" + fname_out)\n", "print(\"\")\n", "\n", "\n", "# write_binary()\n", "\n", "\n", "\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nwykMxZFonZd", "outputId": "b28f3092-6659-4de0-a06a-2e87c66435d5" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Processing variable: transformer.wte.weight\n", " with shape: (50257, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.0.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.0.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.0.attn.c_attn.weight_format\n", "Processing variable: transformer.h.0.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.0.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.0.attn.c_proj.weight_format\n", "Processing variable: transformer.h.0.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to 
float16\n", " Transposing\n", "Processing variable: transformer.h.0.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.0.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.0.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.0.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.0.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.0.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.0.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.0.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.0.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.1.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.1.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.1.attn.c_attn.weight_format\n", "Processing variable: transformer.h.1.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.1.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.1.attn.c_proj.weight_format\n", "Processing variable: transformer.h.1.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.1.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.1.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.1.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.1.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.1.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.1.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.1.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.1.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND 
transformer.h.1.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.2.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.2.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.2.attn.c_attn.weight_format\n", "Processing variable: transformer.h.2.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.2.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.2.attn.c_proj.weight_format\n", "Processing variable: transformer.h.2.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.2.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.2.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.2.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.2.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.2.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.2.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.2.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.2.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.2.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.3.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.3.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.3.attn.c_attn.weight_format\n", "Processing variable: transformer.h.3.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.3.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND 
transformer.h.3.attn.c_proj.weight_format\n", "Processing variable: transformer.h.3.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.3.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.3.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.3.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.3.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.3.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.3.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.3.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.3.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.3.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.4.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.4.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.4.attn.c_attn.weight_format\n", "Processing variable: transformer.h.4.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.4.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.4.attn.c_proj.weight_format\n", "Processing variable: transformer.h.4.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.4.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.4.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.4.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.4.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.4.mlp.c_fc2.weight_format\n", "Processing variable: 
transformer.h.4.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.4.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.4.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.4.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.5.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.5.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.5.attn.c_attn.weight_format\n", "Processing variable: transformer.h.5.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.5.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.5.attn.c_proj.weight_format\n", "Processing variable: transformer.h.5.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.5.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.5.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.5.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.5.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.5.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.5.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.5.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.5.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.5.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.6.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.6.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.6.attn.c_attn.weight_format\n", "Processing variable: transformer.h.6.attn.c_proj.weight\n", " with 
shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.6.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.6.attn.c_proj.weight_format\n", "Processing variable: transformer.h.6.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.6.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.6.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.6.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.6.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.6.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.6.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.6.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.6.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.6.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.7.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.7.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.7.attn.c_attn.weight_format\n", "Processing variable: transformer.h.7.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.7.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.7.attn.c_proj.weight_format\n", "Processing variable: transformer.h.7.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.7.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.7.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.7.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", 
"Processing variable: transformer.h.7.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.7.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.7.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.7.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.7.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.7.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.8.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.8.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.8.attn.c_attn.weight_format\n", "Processing variable: transformer.h.8.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.8.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.8.attn.c_proj.weight_format\n", "Processing variable: transformer.h.8.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.8.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.8.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.8.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.8.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.8.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.8.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.8.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.8.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.8.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.9.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: 
transformer.h.9.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.9.attn.c_attn.weight_format\n", "Processing variable: transformer.h.9.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.9.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.9.attn.c_proj.weight_format\n", "Processing variable: transformer.h.9.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.9.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.9.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.9.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.9.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.9.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.9.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.9.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.9.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.9.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.10.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.10.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.10.attn.c_attn.weight_format\n", "Processing variable: transformer.h.10.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.10.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.10.attn.c_proj.weight_format\n", "Processing variable: transformer.h.10.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: 
transformer.h.10.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.10.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.10.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.10.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.10.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.10.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.10.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.10.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.10.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.11.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.11.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.11.attn.c_attn.weight_format\n", "Processing variable: transformer.h.11.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.11.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.11.attn.c_proj.weight_format\n", "Processing variable: transformer.h.11.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.11.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.11.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.11.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.11.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.11.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.11.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.11.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.11.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.11.mlp.c_proj.weight_format\n", 
"Processing variable: transformer.h.12.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.12.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.12.attn.c_attn.weight_format\n", "Processing variable: transformer.h.12.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.12.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.12.attn.c_proj.weight_format\n", "Processing variable: transformer.h.12.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.12.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.12.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.12.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.12.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.12.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.12.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.12.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.12.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.12.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.13.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.13.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.13.attn.c_attn.weight_format\n", "Processing variable: transformer.h.13.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.13.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.13.attn.c_proj.weight_format\n", 
"Processing variable: transformer.h.13.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.13.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.13.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.13.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.13.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.13.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.13.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.13.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.13.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.13.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.14.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.14.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.14.attn.c_attn.weight_format\n", "Processing variable: transformer.h.14.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.14.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.14.attn.c_proj.weight_format\n", "Processing variable: transformer.h.14.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.14.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.14.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.14.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.14.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.14.mlp.c_fc2.weight_format\n", "Processing variable: 
transformer.h.14.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.14.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.14.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.14.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.15.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.15.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.15.attn.c_attn.weight_format\n", "Processing variable: transformer.h.15.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.15.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.15.attn.c_proj.weight_format\n", "Processing variable: transformer.h.15.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.15.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.15.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.15.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.15.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.15.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.15.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.15.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.15.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.15.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.16.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.16.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.16.attn.c_attn.weight_format\n", "Processing variable: 
transformer.h.16.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.16.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.16.attn.c_proj.weight_format\n", "Processing variable: transformer.h.16.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.16.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.16.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.16.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.16.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.16.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.16.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.16.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.16.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.16.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.17.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.17.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.17.attn.c_attn.weight_format\n", "Processing variable: transformer.h.17.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.17.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.17.attn.c_proj.weight_format\n", "Processing variable: transformer.h.17.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.17.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.17.mlp.c_fc.weight_format\n", "Processing variable: 
transformer.h.17.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.17.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.17.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.17.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.17.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.17.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.17.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.18.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.18.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.18.attn.c_attn.weight_format\n", "Processing variable: transformer.h.18.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.18.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.18.attn.c_proj.weight_format\n", "Processing variable: transformer.h.18.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.18.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.18.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.18.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.18.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.18.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.18.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.18.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.18.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.18.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.19.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: 
transformer.h.19.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.19.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.19.attn.c_attn.weight_format\n", "Processing variable: transformer.h.19.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.19.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.19.attn.c_proj.weight_format\n", "Processing variable: transformer.h.19.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.19.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.19.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.19.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.19.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.19.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.19.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.19.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.19.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.19.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.20.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.20.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.20.attn.c_attn.weight_format\n", "Processing variable: transformer.h.20.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.20.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.20.attn.c_proj.weight_format\n", "Processing variable: transformer.h.20.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: 
transformer.h.20.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.20.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.20.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.20.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.20.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.20.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.20.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.20.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.20.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.20.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.21.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.21.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.21.attn.c_attn.weight_format\n", "Processing variable: transformer.h.21.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.21.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.21.attn.c_proj.weight_format\n", "Processing variable: transformer.h.21.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.21.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.21.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.21.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.21.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.21.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.21.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.21.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.21.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: 
transformer.h.21.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.21.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.22.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.22.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.22.attn.c_attn.weight_format\n", "Processing variable: transformer.h.22.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.22.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.22.attn.c_proj.weight_format\n", "Processing variable: transformer.h.22.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.22.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.22.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.22.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.22.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.22.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.22.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.22.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.22.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.22.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.23.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.23.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.23.attn.c_attn.weight_format\n", "Processing variable: transformer.h.23.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.23.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: 
transformer.h.23.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.23.attn.c_proj.weight_format\n", "Processing variable: transformer.h.23.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.23.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.23.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.23.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.23.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.23.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.23.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.23.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.23.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.23.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.24.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.24.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.24.attn.c_attn.weight_format\n", "Processing variable: transformer.h.24.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.24.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.24.attn.c_proj.weight_format\n", "Processing variable: transformer.h.24.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.24.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.24.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.24.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.24.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.mlp.c_fc2.SCB\n", " with 
shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.24.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.24.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.24.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.24.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.24.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.25.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.25.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.25.attn.c_attn.weight_format\n", "Processing variable: transformer.h.25.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.25.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.25.attn.c_proj.weight_format\n", "Processing variable: transformer.h.25.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.25.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.25.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.25.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.25.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.25.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.25.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.25.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.25.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.25.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.26.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.26.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.attn.c_attn.SCB\n", " with shape: 
(7680,)\n", " Converting to float16\n", "FOUND transformer.h.26.attn.c_attn.weight_format\n", "Processing variable: transformer.h.26.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.26.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.26.attn.c_proj.weight_format\n", "Processing variable: transformer.h.26.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.26.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.26.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.26.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.26.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.26.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.26.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.26.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.26.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.26.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.27.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.27.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.27.attn.c_attn.weight_format\n", "Processing variable: transformer.h.27.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.27.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.27.attn.c_proj.weight_format\n", "Processing variable: transformer.h.27.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.27.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " 
Converting to float16\n", "FOUND transformer.h.27.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.27.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.27.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.27.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.27.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.27.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.27.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.27.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.28.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.28.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.28.attn.c_attn.weight_format\n", "Processing variable: transformer.h.28.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.28.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.28.attn.c_proj.weight_format\n", "Processing variable: transformer.h.28.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.28.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.28.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.28.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.28.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.28.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.28.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.28.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.28.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.28.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.29.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.ln_1.bias\n", " with shape: 
(2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.29.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.29.attn.c_attn.weight_format\n", "Processing variable: transformer.h.29.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.29.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.29.attn.c_proj.weight_format\n", "Processing variable: transformer.h.29.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.29.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.29.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.29.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.29.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.29.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.29.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.29.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.29.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.29.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.30.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.30.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.30.attn.c_attn.weight_format\n", "Processing variable: transformer.h.30.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.30.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.30.attn.c_proj.weight_format\n", "Processing variable: transformer.h.30.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.ln_2.bias\n", " with shape: 
(2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.30.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.30.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.30.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.30.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.30.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.30.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.30.mlp.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.30.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.30.mlp.c_proj.weight_format\n", "Processing variable: transformer.h.31.ln_1.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.ln_1.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.attn.c_attn.weight\n", " with shape: (7680, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.31.attn.c_attn.bias\n", " with shape: (7680,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.attn.c_attn.SCB\n", " with shape: (7680,)\n", " Converting to float16\n", "FOUND transformer.h.31.attn.c_attn.weight_format\n", "Processing variable: transformer.h.31.attn.c_proj.weight\n", " with shape: (2560, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.31.attn.c_proj.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.attn.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.31.attn.c_proj.weight_format\n", "Processing variable: transformer.h.31.ln_2.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.ln_2.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.mlp.c_fc.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.31.mlp.c_fc.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.mlp.c_fc.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.31.mlp.c_fc.weight_format\n", "Processing variable: transformer.h.31.mlp.c_fc2.weight\n", " with shape: (6832, 2560)\n", " Converting to float16\n", "Processing variable: transformer.h.31.mlp.c_fc2.bias\n", " with shape: (6826,)\n", " Converting to float16\n", "Processing variable: transformer.h.31.mlp.c_fc2.SCB\n", " with shape: (6826,)\n", " Converting to float16\n", "FOUND transformer.h.31.mlp.c_fc2.weight_format\n", "Processing variable: transformer.h.31.mlp.c_proj.weight\n", " with shape: (2560, 6848)\n", " Converting to float16\n", " Transposing\n", "Processing variable: transformer.h.31.mlp.c_proj.bias\n", " with shape: (2560,)\n", 
" Converting to float16\n", "Processing variable: transformer.h.31.mlp.c_proj.SCB\n", " with shape: (2560,)\n", " Converting to float16\n", "FOUND transformer.h.31.mlp.c_proj.weight_format\n", "Processing variable: transformer.ln_f.weight\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.ln_f.bias\n", " with shape: (2560,)\n", " Converting to float16\n", "Processing variable: transformer.relative_pe.slopes\n", " with shape: (32,)\n", " Converting to float16\n", "Processing variable: lm_head.weight\n", " with shape: (50257, 2560)\n", " Converting to float16\n", "Done. Output file: btlm-3b.ggml.bin\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from huggingface_hub import login, HfApi\n", "\n", "login()\n", "\n", "api = HfApi()\n", "\n", "api.upload_file(\n", " path_or_fileobj=\"/content/btlm-3b.ggml.bin\",\n", " path_in_repo=\"btlm-3b.ggml.bin\",\n", " repo_id=\"bornjre/btlm-3b-ggml\",\n", " repo_type=\"model\",\n", ")\n", "\n", "# api.upload_folder(\n", "# folder_path=\"/content/btlm-3b-ggml\",\n", "# repo_id=\"bornjre/btlm-3b-ggml\",\n", "# repo_type=\"model\",\n", "# )" ], "metadata": { "id": "bVYr-O99ONqd", "colab": { "base_uri": "https://localhost:8080/", "height": 196, "referenced_widgets": [ "427689dd96c94e9e8706872de004600c", "f3d6c5dcb2674419a9cc9d0a78a417ea", "1f0e6e0047dc471a91a581505031dcf4", "63e88741fa824103a38134ca7f92efc4", "760f19938d4a420fba6b604413bd75c5", "1dc3baf2eb7a4d3981a7a38d574545ea", "59aec4ac781a42709f49465aaa8beda8", "96c57c6e5c5a446d96664912b2a1f923", "acf63c0070b346f8ba0833bf9d6dbf76", "72da3e460812424db18da699f1adc68f", "6b9996976aeb46a781c2827c7eb80400", "0f5aa3f5a3fd4869ad22a7d7eb3f4a01", "1a4f383e5b6e4edea22cf8269ac3e136", "5e880b14816c40e1ade44bb4e8e3e7a0", "c214e6421b8c444eb91f01729650dfc3", "5ce845d6ca8e40c5ad7c9d07f7ad271c", "4b7fc4873cc44b9eaf915d4e8bd5e134", "c8fcc29ea48f44b49ec3547067348bc2", "665705e3373b4a2093bd921a2341bf99", "e890ec5fafb344609f506f16be57d799", "92302f6d76014a778c6cb7370d2f114c", "a1652fbe06094b51994fc22f16f9615a", "a122a0fc52604974a77fe18f7dd43c2f", "dfa55c882b3d44c395d89c53af5f68d7", "fac01390a0634c0caa885e4626fda033", "6c8e3fe6bca1438a879a0ee18abc5294", "d9ffd07e99cb4a2da0966f653351e6cf", "0005a3238bb84c619e657100845f9c84", "b7ace70b606943059acf751b89b53291", "b6e56ddf9da04a52b99de60c9e5a3f57", "55b11341f930432282c18b908e8c7c4a", "a4174533851147d0b5452428cfbcb4d0", "e4cc83f17f3941a69be2c5a328c1df7c", "9f696bef605146c088cd39c3e2d41fa7", "4b995377bec241c6b61b7e68518403bb", "b7e2a45a24724bc79db6b055ff3c6e90", "1b2fb7402b6d4bdcbe38e3e1e9179b74", "51921dbb33f445f7ab72666ecac3f85c", "e804d48983884eb7bfd85123482317f7", "a15221a1538c43c7ac162324644f2c2d", "baac4e68e10c4e7bbffeff26b066f308", "04ed18f2ff804865af93d4d73b347d77", "603ecce9c961468eb90510fe0587bc32" ] }, "outputId": "1a0aef32-812f-4265-91d6-fe66f4f2ce17" }, "execution_count": 9, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='