{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "A100", "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "366e5a0ac67d4e0e94da459f3e69804e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3c6cd74053f74ac18c4f5bbfb9a2fc69", "IPY_MODEL_22d5df7f49b34fec91c7eb4e7e4ab33e", "IPY_MODEL_25153fcf872048379de7c71420f3a581" ], "layout": "IPY_MODEL_a1883d8b08cc458287224bc89aeb54d1" } }, "3c6cd74053f74ac18c4f5bbfb9a2fc69": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e03078ea896e41e7bcd922afd77b83c9", "placeholder": "", "style": "IPY_MODEL_793237ce29034606b2b34bf559cd87da", "value": "tokenizer_config.json: 100%" } }, "22d5df7f49b34fec91c7eb4e7e4ab33e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_54177c30c7974ab9ac986cb9aa17793c", "max": 25, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_95a31c2e01744ccca1fd1d07e1e99d19", "value": 25 } }, "25153fcf872048379de7c71420f3a581": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1d20d5f57db24eb59f4f633ee1443495", "placeholder": "", "style": "IPY_MODEL_31f2258ec506441e83752bfa67d53398", "value": " 25.0/25.0 [00:00<00:00, 1.86kB/s]" } }, "a1883d8b08cc458287224bc89aeb54d1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": 
null, "visibility": null, "width": null } }, "e03078ea896e41e7bcd922afd77b83c9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "793237ce29034606b2b34bf559cd87da": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "54177c30c7974ab9ac986cb9aa17793c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "95a31c2e01744ccca1fd1d07e1e99d19": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1d20d5f57db24eb59f4f633ee1443495": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": 
null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "31f2258ec506441e83752bfa67d53398": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "4628c887a3404cb79319e2586cbf81af": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8ae15ae97e85478aaf8ff109349f419a", "IPY_MODEL_adc84a2b4e54479d927ae5b253eb90c2", "IPY_MODEL_549602a8d77241929793d70afa0d54b9" ], "layout": "IPY_MODEL_5d1d0adb88b748e4859c71019a0cf8e2" } }, "8ae15ae97e85478aaf8ff109349f419a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0c34ffabd284318842c23cc4baba1cf", "placeholder": "", "style": 
"IPY_MODEL_b30aeec96e4d4826bab3c207561b4778", "value": "sentencepiece.bpe.model: 100%" } }, "adc84a2b4e54479d927ae5b253eb90c2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_10b114cb480141cbab6a26f9a89d2a7e", "max": 5069051, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_3943a1720767453784dfaa6e9017afb2", "value": 5069051 } }, "549602a8d77241929793d70afa0d54b9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1d26be052e6d4d479a2c4c68f027a719", "placeholder": "", "style": "IPY_MODEL_5c35bb1be95e4d6c9736330953e045e3", "value": " 5.07M/5.07M [00:01<00:00, 3.39MB/s]" } }, "5d1d0adb88b748e4859c71019a0cf8e2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b0c34ffabd284318842c23cc4baba1cf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b30aeec96e4d4826bab3c207561b4778": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "10b114cb480141cbab6a26f9a89d2a7e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3943a1720767453784dfaa6e9017afb2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1d26be052e6d4d479a2c4c68f027a719": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5c35bb1be95e4d6c9736330953e045e3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "550652ab3d9f482ba2a5485cd84c939b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_09a5d2c99fb9434ab90b3200cd51a3ae", "IPY_MODEL_b4dbc8e0dbd342d19c5f652a004bc765", "IPY_MODEL_4dc271194c7648c8894dd510a69c103d" ], "layout": "IPY_MODEL_4debd0c75c79416d917ea5641e4a8841" } }, "09a5d2c99fb9434ab90b3200cd51a3ae": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7c644402f92b408182ab014e2ea02daa", "placeholder": "", "style": "IPY_MODEL_affe4914cd6f41e39124f093e36cdb07", "value": "tokenizer.json: 100%" } }, "b4dbc8e0dbd342d19c5f652a004bc765": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ea2d20664c5640ff87cd1b909800722c", "max": 9096718, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_ca95df7382f2412b9328f96a463209a1", "value": 9096718 } }, "4dc271194c7648c8894dd510a69c103d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_186e1b0766044f718d2024645c4e87c7", "placeholder": "", "style": "IPY_MODEL_57d59fcaff5e466b8605b23887650cf7", "value": " 9.10M/9.10M [00:01<00:00, 5.30MB/s]" } }, "4debd0c75c79416d917ea5641e4a8841": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7c644402f92b408182ab014e2ea02daa": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "affe4914cd6f41e39124f093e36cdb07": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ea2d20664c5640ff87cd1b909800722c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ca95df7382f2412b9328f96a463209a1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "186e1b0766044f718d2024645c4e87c7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "57d59fcaff5e466b8605b23887650cf7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ce139b88df824efea4d55e4813ee1b88": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_1fb3250b1b5540d8a9365435900db8b5", "IPY_MODEL_675aa319a3504e22a9b1d58eff9188a2", "IPY_MODEL_48e49cdb0ec8417782ed042ca84d4597" ], "layout": "IPY_MODEL_f15259b4926d40b5a70ee8eb5213e9f5" } }, "1fb3250b1b5540d8a9365435900db8b5": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0f1e42f5e4f4ac8b1c4ca12cfebabec", "placeholder": "", "style": "IPY_MODEL_41ea4f253b6b44129196e0d894777c4a", "value": "Map: 100%" } }, "675aa319a3504e22a9b1d58eff9188a2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ccf8fe1474d540a7be7b6757119d92fd", "max": 99545, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_34326871a80140679ffe40ac560192a9", "value": 99545 } }, "48e49cdb0ec8417782ed042ca84d4597": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_19987899825a49b19c31a7225d3ff0b8", "placeholder": "", "style": "IPY_MODEL_ed46a4b1d6b647fcaa01526262b19431", "value": " 99545/99545 [00:52<00:00, 1964.80 examples/s]" } }, "f15259b4926d40b5a70ee8eb5213e9f5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b0f1e42f5e4f4ac8b1c4ca12cfebabec": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, 
"align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "41ea4f253b6b44129196e0d894777c4a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ccf8fe1474d540a7be7b6757119d92fd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, 
"margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "34326871a80140679ffe40ac560192a9": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "19987899825a49b19c31a7225d3ff0b8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ed46a4b1d6b647fcaa01526262b19431": { "model_module": "@jupyter-widgets/controls", "model_name": 
"DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "!pip install transformers datasets seqeval huggingface_hub\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5v8KnAaD-z9t", "outputId": "01e664a6-6621-4ccb-cb02-25e09af4fa9f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n", "Requirement already satisfied: seqeval in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.24.7)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in 
/usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n", "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already 
satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n" ] } ] }, { "cell_type": "code", "source": [ "# Standard library imports\n", "import os # Provides functions 
for interacting with the operating system\n", "import warnings # Used to handle or suppress warnings\n", "import numpy as np # Essential for numerical operations and array manipulation\n", "import torch # PyTorch library for tensor computations and model handling\n", "import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n", "\n", "# Hugging Face and Transformers imports\n", "from datasets import load_dataset # Loads datasets for model training and evaluation\n", "from transformers import (\n", "    AutoTokenizer, # Initializes a tokenizer from a pre-trained model\n", "    DataCollatorForTokenClassification, # Handles padding and formatting of token classification data\n", "    TrainingArguments, # Defines training parameters like batch size and learning rate\n", "    Trainer, # High-level API for managing training and evaluation\n", "    AutoModelForTokenClassification, # Loads a pre-trained model for token classification tasks\n", "    get_linear_schedule_with_warmup, # Learning rate scheduler for gradual warm-up and linear decay\n", "    EarlyStoppingCallback # Callback to stop training if validation performance plateaus\n", ")\n", "\n", "# Hugging Face Hub\n", "from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models\n", "\n", "# seqeval metrics for NER evaluation\n", "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n" ], "metadata": { "id": "amREIFSH-z7r" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Log in to Hugging Face Hub\n", "# The access token is read from the HF_TOKEN environment variable so that no secret is stored in the notebook;\n", "# when the variable is unset, login() receives None and prompts for the token instead.\n", "login(token=os.environ.get(\"HF_TOKEN\"))\n" ], "metadata": { "id": "K7adlboI-z4p", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "88717ba2-23e2-4aff-d1cf-ca876f0f3d46" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The 
token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", "Token is valid (permission: fineGrained).\n", "Your token has been saved to /root/.cache/huggingface/token\n", "Login successful\n" ] } ] }, { "cell_type": "code", "source": [ "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n", "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", "\n", "# Suppress warning messages to keep output clean, especially during training and evaluation\n", "warnings.filterwarnings(\"ignore\")\n" ], "metadata": { "id": "Qccgsjfs-zzA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Load the Azerbaijani NER dataset from Hugging Face\n", "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n", "print(dataset) # Display dataset structure (e.g., train/validation splits)\n", "\n", "# Preprocessing function to format tokens and NER tags correctly\n", "def preprocess_example(example):\n", "    \"\"\"Parse the string-encoded `tokens` and `ner_tags` fields of one example into Python lists.\n", "\n", "    Examples whose fields cannot be parsed are logged and returned with empty token/tag lists.\n", "    \"\"\"\n", "    try:\n", "        # Convert string of tokens to a list and parse NER tags to integers\n", "        example[\"tokens\"] = ast.literal_eval(example[\"tokens\"])\n", "        example[\"ner_tags\"] = list(map(int, ast.literal_eval(example[\"ner_tags\"])))\n", "    except (ValueError, SyntaxError) as e:\n", "        # Log malformed examples and blank them out (they stay in the dataset as empty sequences)\n", "        print(f\"Skipping malformed example: {example['index']} due to error: {e}\")\n", "        example[\"tokens\"] = []\n", "        example[\"ner_tags\"] = []\n", "    return example\n", "\n", "# Apply preprocessing to each dataset entry, ensuring consistent formatting\n", "dataset = dataset.map(preprocess_example)\n" ], "metadata": { "id": "fQ6ttUM8-zwM", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "362280bb-16c3-4462-f568-6eba09915ec1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": 
"stdout", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['index', 'tokens', 'ner_tags'],\n", " num_rows: 99545\n", " })\n", "})\n" ] } ] }, { "cell_type": "code", "source": [ "# Initialize the tokenizer for multilingual NER using xlm-roberta-large\n", "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", "\n", "# Function to tokenize input and align labels with tokenized words\n", "def tokenize_and_align_labels(example):\n", "    \"\"\"Tokenize one pre-split sentence and attach a `labels` list aligned to the resulting tokens.\n", "\n", "    Only the first sub-token of each word receives the word's NER tag; every other position gets -100.\n", "    \"\"\"\n", "    # Tokenize the sentence while preserving word boundaries for correct NER tag alignment\n", "    tokenized_inputs = tokenizer(\n", "        example[\"tokens\"], # List of words (tokens) in the sentence\n", "        truncation=True, # Truncate sentences longer than max_length\n", "        is_split_into_words=True, # Specify that input is a list of words\n", "        max_length=128, # Set the maximum sequence length to 128 tokens\n", "        # No padding here: DataCollatorForTokenClassification pads each batch to its longest sequence\n", "    )\n", "\n", "    labels = [] # List to store aligned NER labels\n", "    word_ids = tokenized_inputs.word_ids() # Get word IDs for each token\n", "    previous_word_idx = None # Initialize previous word index for tracking\n", "\n", "    # Loop through word indices to align NER tags with subword tokens\n", "    for word_idx in word_ids:\n", "        if word_idx is None:\n", "            labels.append(-100) # Tokens that belong to no word (special tokens) get -100 (ignored in loss)\n", "        elif word_idx != previous_word_idx:\n", "            # First sub-token of a word: take the word's tag, or -100 when the tag list is too short\n", "            labels.append(example[\"ner_tags\"][word_idx] if word_idx < len(example[\"ner_tags\"]) else -100)\n", "        else:\n", "            labels.append(-100) # Label subword tokens with -100 to avoid redundant labels\n", "        previous_word_idx = word_idx # Update previous word index on every iteration\n", "\n", "    tokenized_inputs[\"labels\"] = labels # Add labels to tokenized inputs\n", "    return tokenized_inputs\n", "\n", "# Apply tokenization and label alignment function to the dataset\n", "tokenized_datasets = dataset.map(tokenize_and_align_labels, 
batched=False)\n" ], "metadata": { "id": "-24SJijT-zth", "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "366e5a0ac67d4e0e94da459f3e69804e", "3c6cd74053f74ac18c4f5bbfb9a2fc69", "22d5df7f49b34fec91c7eb4e7e4ab33e", "25153fcf872048379de7c71420f3a581", "a1883d8b08cc458287224bc89aeb54d1", "e03078ea896e41e7bcd922afd77b83c9", "793237ce29034606b2b34bf559cd87da", "54177c30c7974ab9ac986cb9aa17793c", "95a31c2e01744ccca1fd1d07e1e99d19", "1d20d5f57db24eb59f4f633ee1443495", "31f2258ec506441e83752bfa67d53398", "4628c887a3404cb79319e2586cbf81af", "8ae15ae97e85478aaf8ff109349f419a", "adc84a2b4e54479d927ae5b253eb90c2", "549602a8d77241929793d70afa0d54b9", "5d1d0adb88b748e4859c71019a0cf8e2", "b0c34ffabd284318842c23cc4baba1cf", "b30aeec96e4d4826bab3c207561b4778", "10b114cb480141cbab6a26f9a89d2a7e", "3943a1720767453784dfaa6e9017afb2", "1d26be052e6d4d479a2c4c68f027a719", "5c35bb1be95e4d6c9736330953e045e3", "550652ab3d9f482ba2a5485cd84c939b", "09a5d2c99fb9434ab90b3200cd51a3ae", "b4dbc8e0dbd342d19c5f652a004bc765", "4dc271194c7648c8894dd510a69c103d", "4debd0c75c79416d917ea5641e4a8841", "7c644402f92b408182ab014e2ea02daa", "affe4914cd6f41e39124f093e36cdb07", "ea2d20664c5640ff87cd1b909800722c", "ca95df7382f2412b9328f96a463209a1", "186e1b0766044f718d2024645c4e87c7", "57d59fcaff5e466b8605b23887650cf7", "ce139b88df824efea4d55e4813ee1b88", "1fb3250b1b5540d8a9365435900db8b5", "675aa319a3504e22a9b1d58eff9188a2", "48e49cdb0ec8417782ed042ca84d4597", "f15259b4926d40b5a70ee8eb5213e9f5", "b0f1e42f5e4f4ac8b1c4ca12cfebabec", "41ea4f253b6b44129196e0d894777c4a", "ccf8fe1474d540a7be7b6757119d92fd", "34326871a80140679ffe40ac560192a9", "19987899825a49b19c31a7225d3ff0b8", "ed46a4b1d6b647fcaa01526262b19431" ] }, "outputId": "ddc67c6c-b931-466e-8da8-90c7ead34f0d" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { 
"version_major": 2, "version_minor": 0, "model_id": "366e5a0ac67d4e0e94da459f3e69804e" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "4628c887a3404cb79319e2586cbf81af" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "tokenizer.json: 0%| | 0.00/9.10M [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "550652ab3d9f482ba2a5485cd84c939b" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Map: 0%| | 0/99545 [00:00, ? examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "ce139b88df824efea4d55e4813ee1b88" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Create a 90-10 split of the dataset for training and validation\n", "# A fixed seed makes the shuffle, and therefore the split, reproducible across runs\n", "tokenized_datasets = tokenized_datasets[\"train\"].train_test_split(test_size=0.1, seed=42)\n", "print(tokenized_datasets) # Output structure of split datasets" ], "metadata": { "id": "DA7mW2it-zoo", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "6c8b73c2-6192-4bd4-87fe-86856ee70625" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", " num_rows: 89590\n", " })\n", " test: Dataset({\n", " features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],\n", " num_rows: 9955\n", " })\n", "})\n" ] } ] }, { "cell_type": "code", "source": [ "# Define a list of entity labels for NER tagging with B- (beginning) and I- (inside) markers\n", "label_list = [\n", "    \"O\", # Outside of a named entity\n", "    \"B-PERSON\", \"I-PERSON\", # Person name (e.g., \"John 
Doe\")\n", " \"B-LOCATION\", \"I-LOCATION\", # Geographical location (e.g., \"Paris\")\n", " \"B-ORGANISATION\", \"I-ORGANISATION\", # Organization name (e.g., \"UNICEF\")\n", " \"B-DATE\", \"I-DATE\", # Date entity (e.g., \"2024-11-05\")\n", " \"B-TIME\", \"I-TIME\", # Time (e.g., \"12:00 PM\")\n", " \"B-MONEY\", \"I-MONEY\", # Monetary values (e.g., \"$20\")\n", " \"B-PERCENTAGE\", \"I-PERCENTAGE\", # Percentage values (e.g., \"20%\")\n", " \"B-FACILITY\", \"I-FACILITY\", # Physical facilities (e.g., \"Airport\")\n", " \"B-PRODUCT\", \"I-PRODUCT\", # Product names (e.g., \"iPhone\")\n", " \"B-EVENT\", \"I-EVENT\", # Named events (e.g., \"Olympics\")\n", " \"B-ART\", \"I-ART\", # Works of art (e.g., \"Mona Lisa\")\n", " \"B-LAW\", \"I-LAW\", # Laws and legal documents (e.g., \"Article 50\")\n", " \"B-LANGUAGE\", \"I-LANGUAGE\", # Languages (e.g., \"Azerbaijani\")\n", " \"B-GPE\", \"I-GPE\", # Geopolitical entities (e.g., \"Europe\")\n", " \"B-NORP\", \"I-NORP\", # Nationalities, religious groups, political groups\n", " \"B-ORDINAL\", \"I-ORDINAL\", # Ordinal indicators (e.g., \"first\", \"second\")\n", " \"B-CARDINAL\", \"I-CARDINAL\", # Cardinal numbers (e.g., \"three\")\n", " \"B-DISEASE\", \"I-DISEASE\", # Diseases (e.g., \"COVID-19\")\n", " \"B-CONTACT\", \"I-CONTACT\", # Contact info (e.g., email or phone number)\n", " \"B-ADAGE\", \"I-ADAGE\", # Common sayings or adages\n", " \"B-QUANTITY\", \"I-QUANTITY\", # Quantities (e.g., \"5 km\")\n", " \"B-MISCELLANEOUS\", \"I-MISCELLANEOUS\", # Miscellaneous entities not fitting other categories\n", " \"B-POSITION\", \"I-POSITION\", # Job titles or positions (e.g., \"CEO\")\n", " \"B-PROJECT\", \"I-PROJECT\" # Project names (e.g., \"Project Apollo\")\n", "]" ], "metadata": { "id": "-lVHfKEE-zmm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialize a data collator to handle padding and formatting for token classification\n", "data_collator = 
DataCollatorForTokenClassification(tokenizer)\n", "\n", "# Load a pre-trained model for token classification, adapted for NER tasks\n", "model = AutoModelForTokenClassification.from_pretrained(\n", "    \"xlm-roberta-large\", # Base model (multilingual XLM-RoBERTa) for NER\n", "    num_labels=len(label_list), # Set the number of output labels to match NER categories\n", "    id2label=dict(enumerate(label_list)), # Store readable tag names in the model config (class index -> tag)\n", "    label2id={tag: idx for idx, tag in enumerate(label_list)} # Inverse mapping (tag -> class index)\n", ")\n" ], "metadata": { "id": "jUfWCaen-zjr", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5399146a-29d0-4dfd-a93b-dc22779dbbdd" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ] }, { "cell_type": "code", "source": [ "# Define a function to compute evaluation metrics for the model's predictions\n", "def compute_metrics(p):\n", "    \"\"\"Compute seqeval precision, recall and F1 from a (logits, label ids) pair.\n", "\n", "    Positions whose label is -100 are excluded from both the references and the predictions.\n", "    A per-entity classification report is printed as a side effect.\n", "    \"\"\"\n", "    predictions, labels = p # Unpack predictions and true labels from the input\n", "\n", "    # Convert logits to predicted label indices by taking the argmax along the last axis\n", "    predictions = np.argmax(predictions, axis=2)\n", "\n", "    # Filter out special padding labels (-100) and convert indices to label names\n", "    true_labels = [[label_list[label_id] for label_id in label if label_id != -100] for label in labels]\n", "    true_predictions = [\n", "        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]\n", "        for prediction, label in zip(predictions, labels)\n", "    ]\n", "\n", "    # Print a detailed classification report for each label category\n", "    print(classification_report(true_labels, true_predictions))\n", "\n", "    # Calculate and return key evaluation metrics\n", "    return {\n", "        # Precision measures the accuracy of predicted positive instances\n", "        # Important in NER to ensure entity predictions are correct and 
reduce false positives.\n", "        \"precision\": precision_score(true_labels, true_predictions),\n", "\n", "        # Recall measures the model's ability to capture all relevant entities\n", "        # Essential in NER to ensure the model captures all entities, reducing false negatives.\n", "        \"recall\": recall_score(true_labels, true_predictions),\n", "\n", "        # F1-score is the harmonic mean of precision and recall, balancing both metrics\n", "        # Useful in NER for providing an overall performance measure, especially when precision and recall are both important.\n", "        \"f1\": f1_score(true_labels, true_predictions),\n", "    }" ], "metadata": { "id": "9b7EajE_-zhS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Set up training arguments for model training, defining essential training configurations\n", "training_args = TrainingArguments(\n", "    output_dir=\"./results\", # Directory to save model checkpoints and final outputs\n", "    eval_strategy=\"epoch\", # Evaluate on the validation set at the end of each epoch (replaces the deprecated `evaluation_strategy`)\n", "    save_strategy=\"epoch\", # Save model checkpoints at the end of each epoch\n", "    learning_rate=2e-5, # Set a low learning rate to ensure stable training for fine-tuning\n", "    per_device_train_batch_size=128, # Number of examples per batch during training, balancing speed and memory\n", "    per_device_eval_batch_size=128, # Number of examples per batch during evaluation\n", "    num_train_epochs=12, # Number of full training passes over the dataset\n", "    weight_decay=0.005, # Regularization term to prevent overfitting by penalizing large weights\n", "    fp16=True, # Use 16-bit floating point for faster and memory-efficient training\n", "    logging_dir='./logs', # Directory to store training logs\n", "    save_total_limit=2, # Keep only the 2 latest model checkpoints to save storage space\n", "    load_best_model_at_end=True, # Load the best model based on metrics at the end of training\n", "    metric_for_best_model=\"f1\", # Use F1-score to 
determine the best model checkpoint\n", " report_to=\"none\" # Disable reporting to external services (useful in local runs)\n", ")\n" ], "metadata": { "id": "PmJTMpp6-zew" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialize the Trainer class to manage the training loop with all necessary components\n", "trainer = Trainer(\n", " model=model, # The pre-trained model to be fine-tuned\n", " args=training_args, # Training configuration parameters defined in TrainingArguments\n", " train_dataset=tokenized_datasets[\"train\"], # Tokenized training dataset\n", " eval_dataset=tokenized_datasets[\"test\"], # Tokenized validation dataset\n", " tokenizer=tokenizer, # Tokenizer used for processing input text\n", " data_collator=data_collator, # Data collator for padding and batching during training\n", " compute_metrics=compute_metrics, # Function to calculate evaluation metrics like precision, recall, F1\n", " callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # Stop training early if the validation F1 doesn't improve for 5 consecutive evaluations (one per epoch)\n", ")\n" ], "metadata": { "id": "WqoF7QJy-zb2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Begin the training process and capture the training metrics\n", "training_metrics = trainer.train()\n", "\n", "# Evaluate the model on the validation set after training\n", "eval_results = trainer.evaluate()\n", "\n", "# Print evaluation results, including precision, recall, and F1-score\n", "print(eval_results)\n" ], "metadata": { "id": "QveYYwvA-zUR", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "a432a1a6-fc14-471e-ad2f-ec25e15fcac8" }, "execution_count": null, "outputs": [ { "data": { "text/html": [ "\n", "
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Precision | \n", "Recall | \n", "F1 | \n", "
---|---|---|---|---|---|
1 | \n", "0.407500 | \n", "0.253823 | \n", "0.768923 | \n", "0.721350 | \n", "0.744377 | \n", "
2 | \n", "0.255600 | \n", "0.249694 | \n", "0.783549 | \n", "0.724464 | \n", "0.752849 | \n", "
3 | \n", "0.214400 | \n", "0.248773 | \n", "0.750857 | \n", "0.748900 | \n", "0.749877 | \n", "
4 | \n", "0.193400 | \n", "0.257051 | \n", "0.768623 | \n", "0.740371 | \n", "0.754232 | \n", "
5 | \n", "0.169800 | \n", "0.275679 | \n", "0.745789 | \n", "0.753740 | \n", "0.749743 | \n", "
6 | \n", "0.152600 | \n", "0.288074 | \n", "0.783131 | \n", "0.728423 | \n", "0.754787 | \n", "
7 | \n", "0.144300 | \n", "0.303378 | \n", "0.758504 | \n", "0.738069 | \n", "0.748147 | \n", "
8 | \n", "0.126800 | \n", "0.311300 | \n", "0.745589 | \n", "0.750863 | \n", "0.748217 | \n", "
9 | \n", "0.119400 | \n", "0.331631 | \n", "0.739316 | \n", "0.749475 | \n", "0.744361 | \n", "
"
],
"text/plain": [
" "
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
" precision recall f1-score support\n",
"\n",
" ART 0.30 0.21 0.25 1828\n",
" DATE 0.52 0.52 0.52 834\n",
" EVENT 0.63 0.54 0.58 63\n",
" FACILITY 0.73 0.70 0.71 1134\n",
" LAW 0.60 0.59 0.60 1066\n",
" LOCATION 0.79 0.79 0.79 8795\n",
" MONEY 0.55 0.60 0.57 555\n",
"ORGANISATION 0.64 0.68 0.66 554\n",
" PERCENTAGE 0.78 0.82 0.80 3502\n",
" PERSON 0.87 0.84 0.85 7007\n",
" PRODUCT 0.83 0.84 0.83 2624\n",
" TIME 0.58 0.56 0.57 1584\n",
"\n",
" micro avg 0.75 0.74 0.75 29546\n",
" macro avg 0.65 0.64 0.65 29546\n",
"weighted avg 0.75 0.74 0.74 29546\n",
"\n",
" precision recall f1-score support\n",
"\n",
" ART 0.32 0.22 0.26 1828\n",
" DATE 0.51 0.52 0.51 834\n",
" EVENT 0.64 0.54 0.59 63\n",
" FACILITY 0.73 0.69 0.71 1134\n",
" LAW 0.60 0.59 0.60 1066\n",
" LOCATION 0.79 0.80 0.79 8795\n",
" MONEY 0.53 0.58 0.55 555\n",
"ORGANISATION 0.65 0.68 0.66 554\n",
" PERCENTAGE 0.79 0.82 0.80 3502\n",
" PERSON 0.87 0.84 0.85 7007\n",
" PRODUCT 0.83 0.85 0.84 2624\n",
" TIME 0.58 0.57 0.57 1584\n",
"\n",
" micro avg 0.75 0.74 0.75 29546\n",
" macro avg 0.65 0.64 0.65 29546\n",
"weighted avg 0.74 0.74 0.74 29546\n",
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"\n",
" \n",
"
\n",
" \n",
" \n",
" \n",
" Epoch \n",
" Training Loss \n",
" Validation Loss \n",
" Precision \n",
" Recall \n",
" F1 \n",
" \n",
" \n",
" 1 \n",
" 0.407500 \n",
" 0.253823 \n",
" 0.768923 \n",
" 0.721350 \n",
" 0.744377 \n",
" \n",
" \n",
" 2 \n",
" 0.255600 \n",
" 0.249694 \n",
" 0.783549 \n",
" 0.724464 \n",
" 0.752849 \n",
" \n",
" \n",
" 3 \n",
" 0.214400 \n",
" 0.248773 \n",
" 0.750857 \n",
" 0.748900 \n",
" 0.749877 \n",
" \n",
" \n",
" 4 \n",
" 0.193400 \n",
" 0.257051 \n",
" 0.768623 \n",
" 0.740371 \n",
" 0.754232 \n",
" \n",
" \n",
" 5 \n",
" 0.169800 \n",
" 0.275679 \n",
" 0.745789 \n",
" 0.753740 \n",
" 0.749743 \n",
" \n",
" \n",
" 6 \n",
" 0.152600 \n",
" 0.288074 \n",
" 0.783131 \n",
" 0.728423 \n",
" 0.754787 \n",
" \n",
" \n",
" 7 \n",
" 0.144300 \n",
" 0.303378 \n",
" 0.758504 \n",
" 0.738069 \n",
" 0.748147 \n",
" \n",
" \n",
" 8 \n",
" 0.126800 \n",
" 0.311300 \n",
" 0.745589 \n",
" 0.750863 \n",
" 0.748217 \n",
" \n",
" \n",
" 9 \n",
" 0.119400 \n",
" 0.331631 \n",
" 0.739316 \n",
" 0.749475 \n",
" 0.744361 \n",
" \n",
" \n",
" 10 \n",
" 0.109400 \n",
" 0.344823 \n",
" 0.754268 \n",
" 0.737189 \n",
" 0.745631 \n",
" \n",
" \n",
" \n",
"11 \n",
" 0.102900 \n",
" 0.354887 \n",
" 0.751948 \n",
" 0.741285 \n",
" 0.746578 \n",
"