{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "A100", "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "366e5a0ac67d4e0e94da459f3e69804e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3c6cd74053f74ac18c4f5bbfb9a2fc69", "IPY_MODEL_22d5df7f49b34fec91c7eb4e7e4ab33e", "IPY_MODEL_25153fcf872048379de7c71420f3a581" ], "layout": "IPY_MODEL_a1883d8b08cc458287224bc89aeb54d1" } }, "3c6cd74053f74ac18c4f5bbfb9a2fc69": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e03078ea896e41e7bcd922afd77b83c9", "placeholder": "​", "style": "IPY_MODEL_793237ce29034606b2b34bf559cd87da", "value": "tokenizer_config.json: 100%" } }, "22d5df7f49b34fec91c7eb4e7e4ab33e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_54177c30c7974ab9ac986cb9aa17793c", "max": 25, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_95a31c2e01744ccca1fd1d07e1e99d19", "value": 25 } }, "25153fcf872048379de7c71420f3a581": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1d20d5f57db24eb59f4f633ee1443495", "placeholder": "​", "style": "IPY_MODEL_31f2258ec506441e83752bfa67d53398", "value": " 25.0/25.0 [00:00<00:00, 1.86kB/s]" } }, "a1883d8b08cc458287224bc89aeb54d1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": 
null, "visibility": null, "width": null } }, "e03078ea896e41e7bcd922afd77b83c9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "793237ce29034606b2b34bf559cd87da": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "54177c30c7974ab9ac986cb9aa17793c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "95a31c2e01744ccca1fd1d07e1e99d19": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1d20d5f57db24eb59f4f633ee1443495": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": 
null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "31f2258ec506441e83752bfa67d53398": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "4628c887a3404cb79319e2586cbf81af": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8ae15ae97e85478aaf8ff109349f419a", "IPY_MODEL_adc84a2b4e54479d927ae5b253eb90c2", "IPY_MODEL_549602a8d77241929793d70afa0d54b9" ], "layout": "IPY_MODEL_5d1d0adb88b748e4859c71019a0cf8e2" } }, "8ae15ae97e85478aaf8ff109349f419a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0c34ffabd284318842c23cc4baba1cf", "placeholder": "​", "style": 
"IPY_MODEL_b30aeec96e4d4826bab3c207561b4778", "value": "sentencepiece.bpe.model: 100%" } }, "adc84a2b4e54479d927ae5b253eb90c2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_10b114cb480141cbab6a26f9a89d2a7e", "max": 5069051, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_3943a1720767453784dfaa6e9017afb2", "value": 5069051 } }, "549602a8d77241929793d70afa0d54b9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1d26be052e6d4d479a2c4c68f027a719", "placeholder": "​", "style": "IPY_MODEL_5c35bb1be95e4d6c9736330953e045e3", "value": " 5.07M/5.07M [00:01<00:00, 3.39MB/s]" } }, "5d1d0adb88b748e4859c71019a0cf8e2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b0c34ffabd284318842c23cc4baba1cf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b30aeec96e4d4826bab3c207561b4778": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "10b114cb480141cbab6a26f9a89d2a7e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3943a1720767453784dfaa6e9017afb2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1d26be052e6d4d479a2c4c68f027a719": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", 
"_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5c35bb1be95e4d6c9736330953e045e3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "550652ab3d9f482ba2a5485cd84c939b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_09a5d2c99fb9434ab90b3200cd51a3ae", "IPY_MODEL_b4dbc8e0dbd342d19c5f652a004bc765", "IPY_MODEL_4dc271194c7648c8894dd510a69c103d" ], "layout": "IPY_MODEL_4debd0c75c79416d917ea5641e4a8841" } }, "09a5d2c99fb9434ab90b3200cd51a3ae": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7c644402f92b408182ab014e2ea02daa", "placeholder": "​", "style": "IPY_MODEL_affe4914cd6f41e39124f093e36cdb07", "value": "tokenizer.json: 100%" } }, "b4dbc8e0dbd342d19c5f652a004bc765": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ea2d20664c5640ff87cd1b909800722c", "max": 9096718, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_ca95df7382f2412b9328f96a463209a1", "value": 9096718 } }, "4dc271194c7648c8894dd510a69c103d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_186e1b0766044f718d2024645c4e87c7", "placeholder": "​", "style": "IPY_MODEL_57d59fcaff5e466b8605b23887650cf7", "value": " 9.10M/9.10M [00:01<00:00, 5.30MB/s]" } }, "4debd0c75c79416d917ea5641e4a8841": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7c644402f92b408182ab014e2ea02daa": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "affe4914cd6f41e39124f093e36cdb07": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ea2d20664c5640ff87cd1b909800722c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ca95df7382f2412b9328f96a463209a1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "186e1b0766044f718d2024645c4e87c7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "57d59fcaff5e466b8605b23887650cf7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ce139b88df824efea4d55e4813ee1b88": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_1fb3250b1b5540d8a9365435900db8b5", "IPY_MODEL_675aa319a3504e22a9b1d58eff9188a2", "IPY_MODEL_48e49cdb0ec8417782ed042ca84d4597" ], "layout": "IPY_MODEL_f15259b4926d40b5a70ee8eb5213e9f5" } }, "1fb3250b1b5540d8a9365435900db8b5": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0f1e42f5e4f4ac8b1c4ca12cfebabec", "placeholder": "​", "style": "IPY_MODEL_41ea4f253b6b44129196e0d894777c4a", "value": "Map: 100%" } }, "675aa319a3504e22a9b1d58eff9188a2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ccf8fe1474d540a7be7b6757119d92fd", "max": 99545, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_34326871a80140679ffe40ac560192a9", "value": 99545 } }, "48e49cdb0ec8417782ed042ca84d4597": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_19987899825a49b19c31a7225d3ff0b8", "placeholder": "​", "style": "IPY_MODEL_ed46a4b1d6b647fcaa01526262b19431", "value": " 99545/99545 [00:52<00:00, 1964.80 examples/s]" } }, "f15259b4926d40b5a70ee8eb5213e9f5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b0f1e42f5e4f4ac8b1c4ca12cfebabec": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, 
"align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "41ea4f253b6b44129196e0d894777c4a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ccf8fe1474d540a7be7b6757119d92fd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, 
"margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "34326871a80140679ffe40ac560192a9": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "19987899825a49b19c31a7225d3ff0b8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ed46a4b1d6b647fcaa01526262b19431": { "model_module": "@jupyter-widgets/controls", "model_name": 
"DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "!pip install transformers datasets seqeval huggingface_hub\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5v8KnAaD-z9t", "outputId": "01e664a6-6621-4ccb-cb02-25e09af4fa9f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n", "Requirement already satisfied: seqeval in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.24.7)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in 
/usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n", "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already 
satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n" ] } ] }, { "cell_type": "code", "source": [ "# Standard library imports\n", "import os # Provides functions 
for interacting with the operating system\n", "import warnings # Used to handle or suppress warnings\n", "import numpy as np # Essential for numerical operations and array manipulation\n", "import torch # PyTorch library for tensor computations and model handling\n", "import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n", "\n", "# Hugging Face and Transformers imports\n", "from datasets import load_dataset # Loads datasets for model training and evaluation\n", "from transformers import (\n", " AutoTokenizer, # Initializes a tokenizer from a pre-trained model\n", " DataCollatorForTokenClassification, # Handles padding and formatting of token classification data\n", " TrainingArguments, # Defines training parameters like batch size and learning rate\n", " Trainer, # High-level API for managing training and evaluation\n", " AutoModelForTokenClassification, # Loads a pre-trained model for token classification tasks\n", " get_linear_schedule_with_warmup, # Learning rate scheduler for gradual warm-up and linear decay\n", " EarlyStoppingCallback # Callback to stop training if validation performance plateaus\n", ")\n", "\n", "# Hugging Face Hub\n", "from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models\n", "\n", "# seqeval metrics for NER evaluation\n", "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n" ], "metadata": { "id": "amREIFSH-z7r" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Log in to Hugging Face Hub.\n", "# The access token is read from the HF_TOKEN environment variable so it is never stored in the notebook.\n", "# If HF_TOKEN is unset, token=None makes login() fall back to its interactive prompt.\n", "login(token=os.environ.get(\"HF_TOKEN\"))\n" ], "metadata": { "id": "K7adlboI-z4p", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "88717ba2-23e2-4aff-d1cf-ca876f0f3d46" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The 
token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n", "Token is valid (permission: fineGrained).\n", "Your token has been saved to /root/.cache/huggingface/token\n", "Login successful\n" ] } ] }, { "cell_type": "code", "source": [ "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n", "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", "\n", "# Suppress warning messages to keep output clean, especially during training and evaluation\n", "warnings.filterwarnings(\"ignore\")\n" ], "metadata": { "id": "Qccgsjfs-zzA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Load the Azerbaijani NER dataset from Hugging Face\n", "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n", "print(dataset) # Display dataset structure (e.g., train/validation splits)\n", "\n", "def preprocess_example(example):\n", "    \"\"\"Parse the string-encoded `tokens` and `ner_tags` fields of one example.\n", "\n", "    `tokens` is replaced by the parsed Python object and `ner_tags` by a list of ints.\n", "    If either field cannot be parsed, the problem is logged and both fields are set to\n", "    empty lists so the row can be filtered out after mapping.\n", "    \"\"\"\n", "    try:\n", "        # Parse both fields before assigning so a failure never leaves the example half-converted\n", "        tokens = ast.literal_eval(example[\"tokens\"])\n", "        ner_tags = [int(tag) for tag in ast.literal_eval(example[\"ner_tags\"])]\n", "    except (ValueError, SyntaxError, TypeError) as e:\n", "        # TypeError covers tag values that int() rejects (e.g. None) and non-iterable tag fields\n", "        print(f\"Skipping malformed example: {example['index']} due to error: {e}\")\n", "        tokens, ner_tags = [], []\n", "    example[\"tokens\"] = tokens\n", "    example[\"ner_tags\"] = ner_tags\n", "    return example\n", "\n", "# Apply preprocessing to each dataset entry, ensuring consistent formatting\n", "dataset = dataset.map(preprocess_example)\n", "\n", "# Drop the examples blanked out above so that rows reported as skipped really are excluded\n", "dataset = dataset.filter(lambda example: len(example[\"tokens\"]) > 0)\n" ], "metadata": { "id": "fQ6ttUM8-zwM", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "362280bb-16c3-4462-f568-6eba09915ec1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": 
"stdout", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['index', 'tokens', 'ner_tags'],\n", " num_rows: 99545\n", " })\n", "})\n" ] } ] }, { "cell_type": "code", "source": [ "# Initialize the tokenizer for multilingual NER using xlm-roberta-large\n", "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", "\n", "def tokenize_and_align_labels(example):\n", "    \"\"\"Tokenize one pre-split sentence and label the first sub-token of each word.\n", "\n", "    Every other position (positions without a word id such as special and padding tokens,\n", "    later sub-tokens of a word, and words with no matching entry in `ner_tags`) is labelled\n", "    -100 so that it is ignored in the loss.\n", "    \"\"\"\n", "    encoding = tokenizer(\n", "        example[\"tokens\"],  # List of words (tokens) in the sentence\n", "        is_split_into_words=True,  # Input is already a list of words\n", "        truncation=True,  # Truncate sentences longer than max_length\n", "        max_length=128,\n", "        padding=\"max_length\",  # Every example is padded out to max_length\n", "    )\n", "    ner_tags = example[\"ner_tags\"]\n", "\n", "    aligned = []\n", "    last_word = None\n", "    for current_word in encoding.word_ids():\n", "        # Only the first sub-token of a word that has a tag receives a real label\n", "        starts_tagged_word = (\n", "            current_word is not None\n", "            and current_word != last_word\n", "            and current_word < len(ner_tags)\n", "        )\n", "        aligned.append(ner_tags[current_word] if starts_tagged_word else -100)\n", "        last_word = current_word\n", "\n", "    encoding[\"labels\"] = aligned\n", "    return encoding\n", "\n", "# Apply tokenization and label alignment function to the dataset\n", "tokenized_datasets = dataset.map(tokenize_and_align_labels, 
batched=False)\n" ], "metadata": { "id": "-24SJijT-zth", "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "366e5a0ac67d4e0e94da459f3e69804e", "3c6cd74053f74ac18c4f5bbfb9a2fc69", "22d5df7f49b34fec91c7eb4e7e4ab33e", "25153fcf872048379de7c71420f3a581", "a1883d8b08cc458287224bc89aeb54d1", "e03078ea896e41e7bcd922afd77b83c9", "793237ce29034606b2b34bf559cd87da", "54177c30c7974ab9ac986cb9aa17793c", "95a31c2e01744ccca1fd1d07e1e99d19", "1d20d5f57db24eb59f4f633ee1443495", "31f2258ec506441e83752bfa67d53398", "4628c887a3404cb79319e2586cbf81af", "8ae15ae97e85478aaf8ff109349f419a", "adc84a2b4e54479d927ae5b253eb90c2", "549602a8d77241929793d70afa0d54b9", "5d1d0adb88b748e4859c71019a0cf8e2", "b0c34ffabd284318842c23cc4baba1cf", "b30aeec96e4d4826bab3c207561b4778", "10b114cb480141cbab6a26f9a89d2a7e", "3943a1720767453784dfaa6e9017afb2", "1d26be052e6d4d479a2c4c68f027a719", "5c35bb1be95e4d6c9736330953e045e3", "550652ab3d9f482ba2a5485cd84c939b", "09a5d2c99fb9434ab90b3200cd51a3ae", "b4dbc8e0dbd342d19c5f652a004bc765", "4dc271194c7648c8894dd510a69c103d", "4debd0c75c79416d917ea5641e4a8841", "7c644402f92b408182ab014e2ea02daa", "affe4914cd6f41e39124f093e36cdb07", "ea2d20664c5640ff87cd1b909800722c", "ca95df7382f2412b9328f96a463209a1", "186e1b0766044f718d2024645c4e87c7", "57d59fcaff5e466b8605b23887650cf7", "ce139b88df824efea4d55e4813ee1b88", "1fb3250b1b5540d8a9365435900db8b5", "675aa319a3504e22a9b1d58eff9188a2", "48e49cdb0ec8417782ed042ca84d4597", "f15259b4926d40b5a70ee8eb5213e9f5", "b0f1e42f5e4f4ac8b1c4ca12cfebabec", "41ea4f253b6b44129196e0d894777c4a", "ccf8fe1474d540a7be7b6757119d92fd", "34326871a80140679ffe40ac560192a9", "19987899825a49b19c31a7225d3ff0b8", "ed46a4b1d6b647fcaa01526262b19431" ] }, "outputId": "ddc67c6c-b931-466e-8da8-90c7ead34f0d" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00\n", " \n", " \n", " [6666/8400 50:10 < 13:03, 2.21 it/s, Epoch 
9.52/12]\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossPrecisionRecallF1
10.4075000.2538230.7689230.7213500.744377
20.2556000.2496940.7835490.7244640.752849
30.2144000.2487730.7508570.7489000.749877
40.1934000.2570510.7686230.7403710.754232
50.1698000.2756790.7457890.7537400.749743
60.1526000.2880740.7831310.7284230.754787
70.1443000.3033780.7585040.7380690.748147
80.1268000.3113000.7455890.7508630.748217
90.1194000.3316310.7393160.7494750.744361

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "metadata": { "tags": null }, "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " ART 0.64 0.12 0.20 1828\n", " DATE 0.50 0.49 0.49 834\n", " EVENT 0.63 0.46 0.53 63\n", " FACILITY 0.70 0.69 0.70 1134\n", " LAW 0.64 0.49 0.56 1066\n", " LOCATION 0.78 0.80 0.79 8795\n", " MONEY 0.62 0.51 0.56 555\n", "ORGANISATION 0.64 0.70 0.67 554\n", " PERCENTAGE 0.76 0.84 0.80 3502\n", " PERSON 0.89 0.81 0.85 7007\n", " PRODUCT 0.82 0.83 0.83 2624\n", " TIME 0.55 0.54 0.54 1584\n", "\n", " micro avg 0.77 0.72 0.74 29546\n", " macro avg 0.68 0.61 0.63 29546\n", "weighted avg 0.76 0.72 0.73 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.60 0.16 0.25 1828\n", " DATE 0.56 0.47 0.51 834\n", " EVENT 0.55 0.56 0.55 63\n", " FACILITY 0.75 0.66 0.70 1134\n", " LAW 0.61 0.57 0.59 1066\n", " LOCATION 0.80 0.78 0.79 8795\n", " MONEY 0.62 0.56 0.59 555\n", "ORGANISATION 0.66 0.66 0.66 554\n", " PERCENTAGE 0.78 0.84 0.81 3502\n", " PERSON 0.88 0.84 0.86 7007\n", " PRODUCT 0.81 0.86 0.83 2624\n", " TIME 0.61 0.47 0.53 1584\n", "\n", " micro avg 0.78 0.72 0.75 29546\n", " macro avg 0.68 0.62 0.64 29546\n", "weighted avg 0.77 0.72 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.41 0.22 0.28 1828\n", " DATE 0.47 0.52 0.49 834\n", " EVENT 0.64 0.51 0.57 63\n", " FACILITY 0.71 0.70 0.71 1134\n", " LAW 0.63 0.56 0.59 1066\n", " LOCATION 0.77 0.82 0.80 8795\n", " MONEY 0.58 0.61 0.59 555\n", "ORGANISATION 0.64 0.69 0.67 554\n", " PERCENTAGE 0.79 0.82 0.80 3502\n", " PERSON 0.84 0.86 0.85 7007\n", " PRODUCT 0.80 0.86 0.83 2624\n", " TIME 0.59 0.53 0.56 1584\n", "\n", " micro avg 0.75 0.75 0.75 29546\n", " macro avg 0.66 0.64 0.64 29546\n", "weighted avg 0.74 0.75 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.40 0.18 0.25 1828\n", " DATE 0.52 0.49 0.50 834\n", " EVENT 0.67 0.51 0.58 63\n", " 
FACILITY 0.78 0.63 0.70 1134\n", " LAW 0.63 0.60 0.61 1066\n", " LOCATION 0.78 0.81 0.80 8795\n", " MONEY 0.63 0.50 0.56 555\n", "ORGANISATION 0.64 0.66 0.65 554\n", " PERCENTAGE 0.79 0.83 0.81 3502\n", " PERSON 0.86 0.85 0.86 7007\n", " PRODUCT 0.81 0.87 0.84 2624\n", " TIME 0.59 0.53 0.56 1584\n", "\n", " micro avg 0.77 0.74 0.75 29546\n", " macro avg 0.68 0.62 0.64 29546\n", "weighted avg 0.75 0.74 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.39 0.20 0.27 1828\n", " DATE 0.51 0.51 0.51 834\n", " EVENT 0.66 0.59 0.62 63\n", " FACILITY 0.73 0.69 0.71 1134\n", " LAW 0.57 0.63 0.60 1066\n", " LOCATION 0.76 0.82 0.79 8795\n", " MONEY 0.59 0.57 0.58 555\n", "ORGANISATION 0.60 0.69 0.64 554\n", " PERCENTAGE 0.76 0.84 0.80 3502\n", " PERSON 0.86 0.84 0.85 7007\n", " PRODUCT 0.79 0.88 0.83 2624\n", " TIME 0.58 0.55 0.56 1584\n", "\n", " micro avg 0.75 0.75 0.75 29546\n", " macro avg 0.65 0.65 0.65 29546\n", "weighted avg 0.74 0.75 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.41 0.19 0.26 1828\n", " DATE 0.53 0.49 0.51 834\n", " EVENT 0.67 0.51 0.58 63\n", " FACILITY 0.74 0.68 0.71 1134\n", " LAW 0.62 0.58 0.60 1066\n", " LOCATION 0.81 0.79 0.80 8795\n", " MONEY 0.59 0.56 0.58 555\n", "ORGANISATION 0.70 0.69 0.70 554\n", " PERCENTAGE 0.80 0.82 0.81 3502\n", " PERSON 0.90 0.82 0.86 7007\n", " PRODUCT 0.83 0.84 0.84 2624\n", " TIME 0.60 0.53 0.57 1584\n", "\n", " micro avg 0.78 0.73 0.75 29546\n", " macro avg 0.68 0.63 0.65 29546\n", "weighted avg 0.77 0.73 0.75 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.34 0.21 0.26 1828\n", " DATE 0.51 0.51 0.51 834\n", " EVENT 0.61 0.52 0.56 63\n", " FACILITY 0.74 0.67 0.70 1134\n", " LAW 0.63 0.56 0.59 1066\n", " LOCATION 0.79 0.79 0.79 8795\n", " MONEY 0.57 0.55 0.56 555\n", "ORGANISATION 0.66 0.68 0.67 554\n", " PERCENTAGE 0.78 0.82 0.80 3502\n", " PERSON 0.86 0.85 0.85 7007\n", " PRODUCT 0.80 0.87 0.83 2624\n", " TIME 0.59 0.54 0.56 1584\n", "\n", 
" micro avg 0.76 0.74 0.75 29546\n", " macro avg 0.66 0.63 0.64 29546\n", "weighted avg 0.75 0.74 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.33 0.22 0.26 1828\n", " DATE 0.52 0.52 0.52 834\n", " EVENT 0.58 0.52 0.55 63\n", " FACILITY 0.74 0.69 0.71 1134\n", " LAW 0.59 0.61 0.60 1066\n", " LOCATION 0.77 0.82 0.79 8795\n", " MONEY 0.56 0.59 0.57 555\n", "ORGANISATION 0.65 0.68 0.66 554\n", " PERCENTAGE 0.79 0.81 0.80 3502\n", " PERSON 0.86 0.86 0.86 7007\n", " PRODUCT 0.82 0.87 0.84 2624\n", " TIME 0.57 0.56 0.56 1584\n", "\n", " micro avg 0.75 0.75 0.75 29546\n", " macro avg 0.65 0.65 0.64 29546\n", "weighted avg 0.74 0.75 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.31 0.22 0.26 1828\n", " DATE 0.50 0.54 0.52 834\n", " EVENT 0.57 0.56 0.56 63\n", " FACILITY 0.72 0.69 0.71 1134\n", " LAW 0.57 0.63 0.60 1066\n", " LOCATION 0.77 0.81 0.79 8795\n", " MONEY 0.51 0.62 0.56 555\n", "ORGANISATION 0.64 0.69 0.66 554\n", " PERCENTAGE 0.78 0.81 0.80 3502\n", " PERSON 0.86 0.84 0.85 7007\n", " PRODUCT 0.81 0.86 0.83 2624\n", " TIME 0.56 0.58 0.57 1584\n", "\n", " micro avg 0.74 0.75 0.74 29546\n", " macro avg 0.63 0.65 0.64 29546\n", "weighted avg 0.73 0.75 0.74 29546\n", "\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "

\n", " \n", " \n", " [7700/8400 58:30 < 05:19, 2.19 it/s, Epoch 11/12]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossPrecisionRecallF1
10.4075000.2538230.7689230.7213500.744377
20.2556000.2496940.7835490.7244640.752849
30.2144000.2487730.7508570.7489000.749877
40.1934000.2570510.7686230.7403710.754232
50.1698000.2756790.7457890.7537400.749743
60.1526000.2880740.7831310.7284230.754787
70.1443000.3033780.7585040.7380690.748147
80.1268000.3113000.7455890.7508630.748217
90.1194000.3316310.7393160.7494750.744361
100.1094000.3448230.7542680.7371890.745631
110.1029000.3548870.7519480.7412850.746578

" ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ " precision recall f1-score support\n", "\n", " ART 0.30 0.21 0.25 1828\n", " DATE 0.52 0.52 0.52 834\n", " EVENT 0.63 0.54 0.58 63\n", " FACILITY 0.73 0.70 0.71 1134\n", " LAW 0.60 0.59 0.60 1066\n", " LOCATION 0.79 0.79 0.79 8795\n", " MONEY 0.55 0.60 0.57 555\n", "ORGANISATION 0.64 0.68 0.66 554\n", " PERCENTAGE 0.78 0.82 0.80 3502\n", " PERSON 0.87 0.84 0.85 7007\n", " PRODUCT 0.83 0.84 0.83 2624\n", " TIME 0.58 0.56 0.57 1584\n", "\n", " micro avg 0.75 0.74 0.75 29546\n", " macro avg 0.65 0.64 0.65 29546\n", "weighted avg 0.75 0.74 0.74 29546\n", "\n", " precision recall f1-score support\n", "\n", " ART 0.32 0.22 0.26 1828\n", " DATE 0.51 0.52 0.51 834\n", " EVENT 0.64 0.54 0.59 63\n", " FACILITY 0.73 0.69 0.71 1134\n", " LAW 0.60 0.59 0.60 1066\n", " LOCATION 0.79 0.80 0.79 8795\n", " MONEY 0.53 0.58 0.55 555\n", "ORGANISATION 0.65 0.68 0.66 554\n", " PERCENTAGE 0.79 0.82 0.80 3502\n", " PERSON 0.87 0.84 0.85 7007\n", " PRODUCT 0.83 0.85 0.84 2624\n", " TIME 0.58 0.57 0.57 1584\n", "\n", " micro avg 0.75 0.74 0.75 29546\n", " macro avg 0.65 0.64 0.65 29546\n", "weighted avg 0.74 0.74 0.74 29546\n", "\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "

\n", " \n", " \n", " [78/78 00:10]\n", "
\n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ " precision recall f1-score support\n", "\n", " ART 0.41 0.19 0.26 1828\n", " DATE 0.53 0.49 0.51 834\n", " EVENT 0.67 0.51 0.58 63\n", " FACILITY 0.74 0.68 0.71 1134\n", " LAW 0.62 0.58 0.60 1066\n", " LOCATION 0.81 0.79 0.80 8795\n", " MONEY 0.59 0.56 0.58 555\n", "ORGANISATION 0.70 0.69 0.70 554\n", " PERCENTAGE 0.80 0.82 0.81 3502\n", " PERSON 0.90 0.82 0.86 7007\n", " PRODUCT 0.83 0.84 0.84 2624\n", " TIME 0.60 0.53 0.57 1584\n", "\n", " micro avg 0.78 0.73 0.75 29546\n", " macro avg 0.68 0.63 0.65 29546\n", "weighted avg 0.77 0.73 0.75 29546\n", "\n", "{'eval_loss': 0.28807422518730164, 'eval_precision': 0.7831307765082599, 'eval_recall': 0.7284234752589183, 'eval_f1': 0.754787122115452, 'eval_runtime': 16.1047, 'eval_samples_per_second': 618.142, 'eval_steps_per_second': 4.843, 'epoch': 11.0}\n" ] } ] }, { "cell_type": "code", "source": [ "# Define the directory where the trained model and tokenizer will be saved\n", "save_directory = \"./xlm-roberta-large\"\n", "\n", "# Save the trained model to the specified directory\n", "model.save_pretrained(save_directory)\n", "\n", "# Save the tokenizer to the same directory for compatibility with the model\n", "tokenizer.save_pretrained(save_directory)\n" ], "metadata": { "id": "7yEFe2_n-zPG", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "d8184694-0ab9-44e4-9b4e-859cd2ea6188" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('./xlm-roberta-large/tokenizer_config.json',\n", " './xlm-roberta-large/special_tokens_map.json',\n", " './xlm-roberta-large/sentencepiece.bpe.model',\n", " './xlm-roberta-large/added_tokens.json',\n", " './xlm-roberta-large/tokenizer.json')" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "from transformers import pipeline\n", "\n", "# Load tokenizer and model\n", "tokenizer = 
AutoTokenizer.from_pretrained(save_directory)\n", "model = AutoModelForTokenClassification.from_pretrained(save_directory)\n", "\n", "# Initialize the NER pipeline\n", "device = 0 if torch.cuda.is_available() else -1\n", "nlp_ner = pipeline(\"ner\", model=model, tokenizer=tokenizer, aggregation_strategy=\"simple\", device=device)\n" ], "metadata": { "id": "zkECg3v9-zNQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Map the pipeline's generic LABEL_<i> names back to the dataset's label names (\"O\" is excluded)\n", "label_mapping = {f\"LABEL_{i}\": label for i, label in enumerate(label_list) if label != \"O\"}\n", "\n", "def evaluate_model(test_texts, true_labels):\n", "    \"\"\"Score the NER pipeline's predicted entity labels against `true_labels` with seqeval.\n", "\n", "    Args:\n", "        test_texts: Sentences passed one at a time to `nlp_ner`.\n", "        true_labels: One list of gold entity labels per sentence.\n", "\n", "    Raises:\n", "        ValueError: If `test_texts` and `true_labels` have different lengths.\n", "\n", "    Predicted and gold labels are compared by position. Extra predictions are truncated and\n", "    missing ones are padded with \"O\", so a sentence with too few predictions is scored as\n", "    having missed entities instead of aborting the evaluation of every sentence.\n", "    \"\"\"\n", "    if len(test_texts) != len(true_labels):\n", "        raise ValueError(f\"Got {len(test_texts)} texts but {len(true_labels)} label lists.\")\n", "\n", "    predictions = []\n", "    for i, (text, gold) in enumerate(zip(test_texts, true_labels)):\n", "        # Keep only entity groups that map to a real (non-\"O\") label\n", "        pred_labels = [\n", "            label_mapping[entity[\"entity_group\"]]\n", "            for entity in nlp_ner(text)\n", "            if entity[\"entity_group\"] in label_mapping\n", "        ]\n", "        if len(pred_labels) != len(gold):\n", "            print(f\"Warning: Inconsistent number of entities in sample {i+1}. Adjusting predicted entities.\")\n", "            # Truncate surplus predictions; pad missing ones with \"O\" so lengths always match\n", "            pred_labels = pred_labels[:len(gold)] + [\"O\"] * (len(gold) - len(pred_labels))\n", "        predictions.append(pred_labels)\n", "\n", "    print(\"Precision:\", precision_score(true_labels, predictions))\n", "    print(\"Recall:\", recall_score(true_labels, predictions))\n", "    print(\"F1-Score:\", f1_score(true_labels, predictions))\n", "    print(classification_report(true_labels, predictions))\n" ], "metadata": { "id": "SOFqXU-M_bxO" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "test_texts = [\"Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat.\"]\n", "true_labels = [[\"B-PERSON\", \"B-ORGANISATION\"]]\n", "evaluate_model(test_texts, true_labels)\n" ], "metadata": { "id": "WRCB-_66_buE", "colab": { "base_uri": "https://localhost:8080/" }, 
"outputId": "da8833c8-27e8-40cc-d32f-4eb11158278d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Precision: 0.5\n", "Recall: 0.5\n", "F1-Score: 0.5\n", " precision recall f1-score support\n", "\n", " LOCATION 0.00 0.00 0.00 0\n", "ORGANISATION 0.00 0.00 0.00 1\n", " PERSON 1.00 1.00 1.00 1\n", "\n", " micro avg 0.50 0.50 0.50 2\n", " macro avg 0.33 0.33 0.33 2\n", "weighted avg 0.50 0.50 0.50 2\n", "\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "x53zS3Vv_brU" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "5Uoebirj_boo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "RKounG2l_bl5" }, "execution_count": null, "outputs": [] } ] }