{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "f7b7943b77ec48709ab2c14ff9c7d8d0": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8ccb497ccc254b4fbb83a6075d999c98", "IPY_MODEL_462e849b058049eb90bdec861c4c6733", "IPY_MODEL_433354ec1982417b8e182d982451cff8" ], "layout": "IPY_MODEL_79e9f2981ea4421a864c14f0c3c47c88" } }, "8ccb497ccc254b4fbb83a6075d999c98": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1f7ddf0908aa4a4bbf20c0a98c534802", "placeholder": "​", "style": "IPY_MODEL_a62aa5c3bacd439b88b66448048054de", "value": "config.json: 100%" } }, "462e849b058049eb90bdec861c4c6733": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_29f23229640c4a3b978c630bed86b0a0", "max": 725, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_65e51c3b78c14a1db0b8b730534aa5b9", "value": 725 } }, "433354ec1982417b8e182d982451cff8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_108e829acb474ad7b9b5d4576fbfd340", "placeholder": "​", "style": "IPY_MODEL_7f94536dab254a8ebbce57f65a99511d", "value": " 725/725 [00:00<00:00, 30.0kB/s]" } }, "79e9f2981ea4421a864c14f0c3c47c88": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1f7ddf0908aa4a4bbf20c0a98c534802": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a62aa5c3bacd439b88b66448048054de": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "29f23229640c4a3b978c630bed86b0a0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "65e51c3b78c14a1db0b8b730534aa5b9": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "108e829acb474ad7b9b5d4576fbfd340": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7f94536dab254a8ebbce57f65a99511d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e2dc2d66fe9d4c068b7400aa68166d82": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_c4fdbaab18b04bd092bc8327dc9384f9", "IPY_MODEL_76d175d827dc479da41cf4d195176a73", "IPY_MODEL_02d1a242c3c346f9ba00d4ef54632080" ], "layout": "IPY_MODEL_3afe49394a7b4e63abbfa6eea620054b" } }, "c4fdbaab18b04bd092bc8327dc9384f9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5cc5407d5562422b8b4306b9b4540ab8", "placeholder": "​", "style": "IPY_MODEL_2bbd4694331d42b69a279571576deb51", "value": "pytorch_model.bin: 100%" } }, "76d175d827dc479da41cf4d195176a73": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_23c88c6f258a402288b6ffe041f61f0b", "max": 1425820242, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_8c75ac06d39f4bf58c4b2cdf513c47a1", "value": 1425820242 } }, "02d1a242c3c346f9ba00d4ef54632080": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ac9cf05fb66c4b28beebe6ce874a296b", "placeholder": "​", "style": "IPY_MODEL_5180b588b5f546a1a5c1b99842ad9bc4", "value": " 1.43G/1.43G [00:13<00:00, 119MB/s]" } }, "3afe49394a7b4e63abbfa6eea620054b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5cc5407d5562422b8b4306b9b4540ab8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2bbd4694331d42b69a279571576deb51": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "23c88c6f258a402288b6ffe041f61f0b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8c75ac06d39f4bf58c4b2cdf513c47a1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "ac9cf05fb66c4b28beebe6ce874a296b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5180b588b5f546a1a5c1b99842ad9bc4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "536c41525a424101a64e8c3c7fe6a1a7": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_c95bb7e9112f49bbb4b266fe7462586b", "IPY_MODEL_3d53fbf327a74eccbdeb2bdd7e0e9ea6", "IPY_MODEL_f15764b0fbcc470da3735693598b03d0" ], "layout": "IPY_MODEL_6b9d84472ca14e869b0eedfd5faa853e" } }, "c95bb7e9112f49bbb4b266fe7462586b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e7c76fe4370b4674a6ada68197e5b6e1", "placeholder": "​", "style": "IPY_MODEL_521b25ffe3e6414d918d9c38dc0f6251", "value": "tokenizer_config.json: 100%" } }, "3d53fbf327a74eccbdeb2bdd7e0e9ea6": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_49e97c7601b1484cb32de9a14b66a80e", "max": 1070, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a60aa23254dc402397580e8cfa14dde2", "value": 1070 } }, "f15764b0fbcc470da3735693598b03d0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_97016178d3b249398e3bbbd6902a1f45", "placeholder": "​", "style": "IPY_MODEL_e4203f38f1e34e9baa953248c5d04867", "value": " 1.07k/1.07k [00:00<00:00, 52.3kB/s]" } }, "6b9d84472ca14e869b0eedfd5faa853e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e7c76fe4370b4674a6ada68197e5b6e1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "521b25ffe3e6414d918d9c38dc0f6251": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "49e97c7601b1484cb32de9a14b66a80e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a60aa23254dc402397580e8cfa14dde2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "97016178d3b249398e3bbbd6902a1f45": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e4203f38f1e34e9baa953248c5d04867": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "55fee200080e4ffa842c7b3ccba6202c": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_681e215549a64a2d86046c20f943accb", "IPY_MODEL_585daf7d5699401284c3fe9991cfbd1d", "IPY_MODEL_0754f254add24b1bbac53908ace0674e" ], "layout": "IPY_MODEL_1c5dd19c1b994e20b6286f921d6e3a08" } }, "681e215549a64a2d86046c20f943accb": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c05c707a897242cebb1844ba012ba823", "placeholder": "​", "style": "IPY_MODEL_3d05b61f2061429095a1a2d4dc7500dd", "value": "vocab.json: 100%" } }, "585daf7d5699401284c3fe9991cfbd1d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a8e4bf0b21184ccf9951bdb9114fd10e", "max": 898822, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_96934e1a081e480e9608f355cca41566", "value": 898822 } }, "0754f254add24b1bbac53908ace0674e": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_68f47edfbd5d4bacaa3e6424c98f8961", "placeholder": "​", "style": "IPY_MODEL_ffa81a80e84546148817330dd5c9eefa", "value": " 899k/899k [00:00<00:00, 3.38MB/s]" } }, "1c5dd19c1b994e20b6286f921d6e3a08": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c05c707a897242cebb1844ba012ba823": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3d05b61f2061429095a1a2d4dc7500dd": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a8e4bf0b21184ccf9951bdb9114fd10e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "96934e1a081e480e9608f355cca41566": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "68f47edfbd5d4bacaa3e6424c98f8961": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ffa81a80e84546148817330dd5c9eefa": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c9d0683b6fb64df1bc370de4011fe77f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d1dc02d934cc417b983dc13d80b22d31", "IPY_MODEL_cad51788127f440393bea3c2d165f42e", "IPY_MODEL_2eaee4ea34a647259f015fbd9190e821" ], "layout": "IPY_MODEL_710c585a886d4d17bd66ebef545b3371" } }, "d1dc02d934cc417b983dc13d80b22d31": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f2130c9ea479452699ce1389a74097e6", "placeholder": "​", "style": "IPY_MODEL_f7db554186144eb0807f71fc142580fa", "value": "merges.txt: 100%" } }, "cad51788127f440393bea3c2d165f42e": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0c61a26cae2d4e2f851be13f51ee78c3", "max": 456318, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_ec49ebfb9b16430181b4738b01844692", "value": 456318 } }, "2eaee4ea34a647259f015fbd9190e821": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b6f81ce13f774cc1a42ec659d50bfca3", "placeholder": "​", "style": "IPY_MODEL_ffc2e024df9241de9d7c22269a47fe96", "value": " 456k/456k [00:00<00:00, 3.42MB/s]" } }, "710c585a886d4d17bd66ebef545b3371": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f2130c9ea479452699ce1389a74097e6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f7db554186144eb0807f71fc142580fa": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0c61a26cae2d4e2f851be13f51ee78c3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ec49ebfb9b16430181b4738b01844692": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b6f81ce13f774cc1a42ec659d50bfca3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ffc2e024df9241de9d7c22269a47fe96": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c7581989b1d84bc5b453551440671767": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_f841b17df29b42799c740b305f44c3f5", "IPY_MODEL_fb3820553c8246f8a3f19d11f2929e26", "IPY_MODEL_9a0be6bf09a943dba3582e78661b64ce" ], "layout": "IPY_MODEL_97881b93a47e470eb9673ef97a28d741" } }, "f841b17df29b42799c740b305f44c3f5": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_696a8fb7186e4ad9bed904d79cc5da17", "placeholder": "​", "style": "IPY_MODEL_fb483bee70b04cd1bc2cef99941dfbe8", "value": "special_tokens_map.json: 100%" } }, "fb3820553c8246f8a3f19d11f2929e26": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_dc610f57bb0d42a4970fd56e2beafd3e", "max": 772, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_77f9434bf9984bbf9d7e0b409f58c236", "value": 772 } }, "9a0be6bf09a943dba3582e78661b64ce": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1b7e9249af884ecd89bdfe4da708ac38", "placeholder": "​", "style": "IPY_MODEL_c3e369bcece54aa4bb6baba02f6c4c30", "value": " 772/772 [00:00<00:00, 35.6kB/s]" } }, "97881b93a47e470eb9673ef97a28d741": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "696a8fb7186e4ad9bed904d79cc5da17": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fb483bee70b04cd1bc2cef99941dfbe8": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "dc610f57bb0d42a4970fd56e2beafd3e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "77f9434bf9984bbf9d7e0b409f58c236": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1b7e9249af884ecd89bdfe4da708ac38": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c3e369bcece54aa4bb6baba02f6c4c30": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "\n", "# Preprocess text data\n", "df['tweet'] = df['tweet'].str.lower()\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('@[^\\s]+', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{string.punctuation}]', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(str(x).split()))\n", "df['tweet'] = df['tweet'].apply(lambda x: [w for w in x.split() if w not in stopwords.words('english')])\n", "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x))\n", "df['tweet'] = df['tweet'].apply(lambda x: WordNetLemmatizer().lemmatize(x))\n", "\n", "# Fit the bag of words transformer to the text column\n", "bow_transformer = CountVectorizer().fit(df['tweet'])\n", "\n", "# Transform the text column to bag of words representation\n", "text_bow = bow_transformer.transform(df['tweet'])\n", "\n", "# Apply Tf-Idf transformer to the bag of words representation\n", "tfidf_transformer = TfidfTransformer().fit(text_bow)\n", "text_tfidf = tfidf_transformer.transform(text_bow)\n", "\n", "# Split the data into train and test sets\n", "X_train, X_test, y_train, y_test = train_test_split(text_tfidf, df['label'], test_size=0.2, random_state=42)\n", "\n", "# Initialize and train the model\n", "model = LogisticRegression()\n", "model.fit(X_train, y_train)\n", "\n", "# Predict using the test set\n", "y_pred = model.predict(X_test)\n", "\n", "# Calculate accuracy\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy:\", accuracy)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ELI_EP93ws0o", "outputId": "0bae90bc-d511-4888-c507-cc5908f38a82" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Accuracy: 0.677\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.model_selection import train_test_split, KFold\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from xgboost import XGBClassifier\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "\n", "# Preprocess text data\n", "df['tweet'] = df['tweet'].str.lower()\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('@[^\\s]+', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{string.punctuation}]', ' ', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n", "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(str(x).split()))\n", "df['tweet'] = df['tweet'].apply(lambda x: [w for w in x.split() if w not in stopwords.words('english')])\n", "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x))\n", "df['tweet'] = df['tweet'].apply(lambda x: WordNetLemmatizer().lemmatize(x))\n", "\n", "# Split the data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Initialize models\n", "models = {\n", " 'Logistic Regression': LogisticRegression(),\n", " 'Random Forest': RandomForestClassifier(),\n", " 'Support Vector Machine': SVC(),\n", " 'Multinomial Naive Bayes': MultinomialNB(),\n", " 'XGBoost': XGBClassifier()\n", "}\n", "\n", "# Apply K-Fold cross-validation\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "for model_name, model in models.items():\n", " print(f\"Training {model_name}:\")\n", " accuracies = []\n", " for train_index, test_index in kf.split(X):\n", " X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", " y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n", "\n", " # Fit the bag of words transformer to the text column\n", " bow_transformer = CountVectorizer().fit(X_train)\n", " # Transform the text column to bag of words representation\n", " text_bow_train = bow_transformer.transform(X_train)\n", " text_bow_test = bow_transformer.transform(X_test)\n", "\n", " # Apply Tf-Idf transformer to the bag of words representation\n", " tfidf_transformer = TfidfTransformer().fit(text_bow_train)\n", " text_tfidf_train = tfidf_transformer.transform(text_bow_train)\n", " text_tfidf_test = tfidf_transformer.transform(text_bow_test)\n", "\n", " # Train the model\n", " model.fit(text_tfidf_train, y_train)\n", "\n", " # Predict using the test set\n", " y_pred = model.predict(text_tfidf_test)\n", "\n", " # Calculate accuracy\n", " accuracy = accuracy_score(y_test, y_pred)\n", " accuracies.append(accuracy)\n", " print(f\" - Fold accuracy: {accuracy}\")\n", "\n", " # Average accuracy across all folds\n", " avg_accuracy = np.mean(accuracies)\n", " print(f\"{model_name} average accuracy: {avg_accuracy}\\n\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 369 }, "id": "mo68I9qZ2zqB", "outputId": "3bff4613-0bfd-4d46-dd0c-3b70b976488c" }, "execution_count": null, "outputs": [ { "output_type": "error", "ename": "FileNotFoundError", "evalue": "[Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Read the CSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive/MyDrive/train (1).csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m# Preprocess text data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 910\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 912\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 913\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 914\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 577\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 578\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1405\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1406\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1407\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1409\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1660\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1661\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 1662\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1663\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "\n", "# Preprocess text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Initialize models\n", "models = {\n", " 'Random Forest': RandomForestClassifier(),\n", " 'Logistic Regression': LogisticRegression(),\n", " 'Support Vector Machine': SVC(),\n", " 'Multinomial Naive Bayes': MultinomialNB()\n", "}\n", "\n", "# Define hyperparameters grid for each model\n", "param_grid = {\n", " 'Random Forest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]},\n", " 'Logistic Regression': {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},\n", " 'Support Vector Machine': {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']},\n", " 'Multinomial Naive Bayes': {'classifier__alpha': [0.1, 0.5, 1.0]},\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Perform hyperparameter tuning for each model\n", "for model_name, model in models.items():\n", " print(f\"Tuning hyperparameters for {model_name}:\")\n", "\n", " # Define pipeline with TfidfVectorizer and model\n", " pipeline = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', model)\n", " ])\n", "\n", " # Perform grid search\n", " grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=kf, scoring='accuracy', n_jobs=-1)\n", " grid_search.fit(X, y)\n", "\n", " # Print best parameters and best cross-validation accuracy\n", " best_params = grid_search.best_params_\n", " best_score = grid_search.best_score_\n", " print(f\"Best parameters: {best_params}\")\n", " print(f\"Best cross-validation accuracy: {best_score}\\n\")\n", "\n", "# Choose the best model\n", "best_model_name = max(models, key=lambda k: grid_search.cv_results_['mean_test_score'][np.argwhere(grid_search.cv_results_['rank_test_score'] == 1)[0][0]])\n", "best_model = grid_search.best_estimator_\n", "\n", "# Fit the best model\n", "best_model.fit(X, y)\n", "\n", "# Predict using the best model\n", "y_pred = best_model.predict(X)\n", "\n", "# Calculate accuracy\n", "accuracy = accuracy_score(y, y_pred)\n", "print(\"Final Accuracy:\", accuracy)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YxlqbMM74t1m", "outputId": "ecde4844-a8bd-42f9-ff3b-b88599c5c0cc" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Random Forest:\n", "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}\n", "Best cross-validation accuracy: 0.6992999999999999\n", "\n", "Tuning hyperparameters for Logistic Regression:\n", "Best parameters: {'classifier__C': 10}\n", "Best cross-validation accuracy: 0.7055\n", "\n", "Tuning hyperparameters for Support Vector Machine:\n", "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", "Best cross-validation accuracy: 0.7144999999999999\n", "\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n", "Best parameters: {'classifier__alpha': 0.1}\n", "Best cross-validation accuracy: 0.6957\n", "\n", "Final Accuracy: 0.9004\n" ] } ] }, { "cell_type": "code", "source": [ "#فصل اعدادات كل موديل\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "\n", "# Preprocess text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X, y)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "print(f\"Best parameters: {best_params_rf}\")\n", "print(f\"Best cross-validation accuracy: {best_score_rf}\\n\")\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X, y)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "print(f\"Best parameters: {best_params_lr}\")\n", "print(f\"Best cross-validation accuracy: {best_score_lr}\\n\")\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X, y)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "print(f\"Best parameters: {best_params_svm}\")\n", "print(f\"Best cross-validation accuracy: {best_score_svm}\\n\")\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X, y)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "print(f\"Best parameters: {best_params_nb}\")\n", "print(f\"Best cross-validation accuracy: {best_score_nb}\\n\")\n", "\n", "# Choose the best model\n", "best_model_name = max({\n", " 'Random Forest': best_score_rf,\n", " 'Logistic Regression': best_score_lr,\n", " 'Support Vector Machine': best_score_svm,\n", " 'Multinomial Naive Bayes': best_score_nb\n", "}.items(), key=lambda x: x[1])[0]\n", "best_model = None\n", "\n", "if best_model_name == 'Random Forest':\n", " best_model = grid_search_rf.best_estimator_\n", "elif best_model_name == 'Logistic Regression':\n", " best_model = grid_search_lr.best_estimator_\n", "elif best_model_name == 'Support Vector Machine':\n", " best_model = grid_search_svm.best_estimator_\n", "elif best_model_name == 'Multinomial Naive Bayes':\n", " best_model = grid_search_nb.best_estimator_\n", "\n", "# Fit the best model\n", "if best_model:\n", " best_model.fit(X, y)\n", " # Predict using the best model\n", " y_pred = best_model.predict(X)\n", " # Calculate accuracy\n", " accuracy = accuracy_score(y, y_pred)\n", " print(\"Final Accuracy:\", accuracy)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eOKgOLHmFc8u", "outputId": "93efecb0-a9c5-4497-9544-2959fa534f4d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Random Forest:\n", "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n", "Best cross-validation accuracy: 0.6977\n", "\n", "Tuning hyperparameters for Logistic Regression:\n", "Best parameters: {'classifier__C': 10}\n", "Best cross-validation accuracy: 0.7055\n", "\n", "Tuning hyperparameters for Support Vector Machine:\n", "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", "Best cross-validation accuracy: 0.7144999999999999\n", "\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n", "Best parameters: {'classifier__alpha': 0.1}\n", "Best cross-validation accuracy: 0.6957\n", "\n", "Final Accuracy: 0.9954\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "import time\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "df # إظهار البيانات\n", "\n", "# Preprocess text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X, y)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "print(f\"Best parameters: {best_params_rf}\")\n", "print(f\"Best cross-validation accuracy: {best_score_rf}\\n\")\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X, y)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "print(f\"Best parameters: {best_params_lr}\")\n", "print(f\"Best cross-validation accuracy: {best_score_lr}\\n\")\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X, y)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "print(f\"Best parameters: {best_params_svm}\")\n", "print(f\"Best cross-validation accuracy: {best_score_svm}\\n\")\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X, y)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "print(f\"Best parameters: {best_params_nb}\")\n", "print(f\"Best cross-validation accuracy: {best_score_nb}\\n\")\n", "\n", "# Choose the best model\n", "best_model_name = max({\n", " 'Random Forest': best_score_rf,\n", " 'Logistic Regression': best_score_lr,\n", " 'Support Vector Machine': best_score_svm,\n", " 'Multinomial Naive Bayes': best_score_nb\n", "}.items(), key=lambda x: x[1])[0]\n", "best_model = None\n", "\n", "if best_model_name == 'Random Forest':\n", " best_model = grid_search_rf.best_estimator_\n", "elif best_model_name == 'Logistic Regression':\n", " best_model = grid_search_lr.best_estimator_\n", "elif best_model_name == 'Support Vector Machine':\n", " best_model = grid_search_svm.best_estimator_\n", "elif best_model_name == 'Multinomial Naive Bayes':\n", " best_model = grid_search_nb.best_estimator_\n", "\n", "# Fit the best model\n", "if best_model:\n", " best_model.fit(X, y)\n", " # Predict using the best model\n", " y_pred = best_model.predict(X)\n", " # Calculate accuracy\n", " accuracy = accuracy_score(y, y_pred)\n", " print(\"Final Accuracy:\", accuracy)\n", "\n", " # Calculate and print confusion matrix and classification report\n", " cm = confusion_matrix(y, y_pred)\n", " cr = classification_report(y, y_pred)\n", " print(\"Confusion Matrix:\")\n", " print(cm)\n", " print(\"Classification Report:\")\n", " print(cr)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xOXg8UEbC-kj", "outputId": "409ac90e-a68d-451b-b299-b3322f54803f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", "Tuning hyperparameters for Random Forest:\n", "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n", "Best cross-validation accuracy: 0.6995999999999999\n", "\n", "Tuning hyperparameters for Logistic Regression:\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Best parameters: {'classifier__C': 10}\n", "Best cross-validation accuracy: 0.7055\n", "\n", "Tuning hyperparameters for Support Vector Machine:\n", "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", "Best cross-validation accuracy: 0.7144999999999999\n", "\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n", "Best parameters: {'classifier__alpha': 0.1}\n", "Best cross-validation accuracy: 0.6957\n", "\n", "Final Accuracy: 0.9954\n", "Confusion Matrix:\n", "[[2423 8 1]\n", " [ 4 4966 12]\n", " [ 0 21 2565]]\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " -1 1.00 1.00 1.00 2432\n", " 0 0.99 1.00 1.00 4982\n", " 1 0.99 0.99 0.99 2586\n", "\n", " accuracy 1.00 10000\n", " macro avg 1.00 0.99 1.00 10000\n", "weighted avg 1.00 1.00 1.00 10000\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "#الكود الصح مبدئياً\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "import time\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "df # إظهار البيانات\n", "\n", "# Preprocess text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Function to display results in a DataFrame\n", "def display_results(results):\n", " df_results = pd.DataFrame(results)\n", " display(df_results)\n", "\n", "# Initialize an empty list to store results\n", "results = []\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X, y)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X, y)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X, y)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X, y)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", "\n", "# Display results\n", "display_results(results)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 482 }, "id": "rWMIvJ3JJUpT", "outputId": "0ab4308e-3d57-47af-aaca-534e68fe43a6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", "Tuning hyperparameters for Random Forest:\n", "Tuning hyperparameters for Logistic Regression:\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Support Vector Machine:\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " Model Best Parameters \\\n", "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", "1 Logistic Regression {'classifier__C': 10} \n", "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... \n", "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", "\n", " Best Cross-validation Accuracy \n", "0 0.6981 \n", "1 0.7055 \n", "2 0.7145 \n", "3 0.6957 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelBest ParametersBest Cross-validation Accuracy
0Random Forest{'classifier__max_depth': None, 'classifier__n...0.6981
1Logistic Regression{'classifier__C': 10}0.7055
2Support Vector Machine{'classifier__C': 10, 'classifier__gamma': 'sc...0.7145
3Multinomial Naive Bayes{'classifier__alpha': 0.1}0.6957
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display_results(results)\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"Model\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Logistic Regression\",\n \"Multinomial Naive Bayes\",\n \"Random Forest\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Parameters\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Cross-validation Accuracy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.008465419855703142,\n \"min\": 0.6957,\n \"max\": 0.7144999999999999,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.7055,\n 0.6957,\n 0.6980999999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "#using model from hugging face\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "import time\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n", "from transformers import pipeline\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# Read the CSV file using Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "print(df.head()) # Display the data\n", "\n", "# Preprocess the text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Initialize sentiment analysis pipeline\n", "sentiment_pipeline = pipeline(\"text-classification\", model=\"j-hartmann/sentiment-roberta-large-english-3-classes\")\n", "\n", "# Apply sentiment analysis to the text column\n", "sentiment_predictions = sentiment_pipeline(df['tweet'].tolist())\n", "\n", "# Add the sentiment predictions to the DataFrame\n", "df['sentiment'] = [sent['label'] for sent in sentiment_predictions]\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Function to display results in a DataFrame\n", "def display_results(results):\n", " df_results = pd.DataFrame(results)\n", " display(df_results)\n", "\n", "# Initialize an empty list to store results\n", "results = []\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X, y)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X, y)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X, y)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X, y)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", "\n", "# Display results\n", "display_results(results)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 984, "referenced_widgets": [ "f7b7943b77ec48709ab2c14ff9c7d8d0", "8ccb497ccc254b4fbb83a6075d999c98", "462e849b058049eb90bdec861c4c6733", "433354ec1982417b8e182d982451cff8", "79e9f2981ea4421a864c14f0c3c47c88", "1f7ddf0908aa4a4bbf20c0a98c534802", "a62aa5c3bacd439b88b66448048054de", "29f23229640c4a3b978c630bed86b0a0", "65e51c3b78c14a1db0b8b730534aa5b9", "108e829acb474ad7b9b5d4576fbfd340", "7f94536dab254a8ebbce57f65a99511d", "e2dc2d66fe9d4c068b7400aa68166d82", "c4fdbaab18b04bd092bc8327dc9384f9", "76d175d827dc479da41cf4d195176a73", "02d1a242c3c346f9ba00d4ef54632080", "3afe49394a7b4e63abbfa6eea620054b", "5cc5407d5562422b8b4306b9b4540ab8", "2bbd4694331d42b69a279571576deb51", "23c88c6f258a402288b6ffe041f61f0b", "8c75ac06d39f4bf58c4b2cdf513c47a1", "ac9cf05fb66c4b28beebe6ce874a296b", "5180b588b5f546a1a5c1b99842ad9bc4", "536c41525a424101a64e8c3c7fe6a1a7", "c95bb7e9112f49bbb4b266fe7462586b", "3d53fbf327a74eccbdeb2bdd7e0e9ea6", "f15764b0fbcc470da3735693598b03d0", "6b9d84472ca14e869b0eedfd5faa853e", "e7c76fe4370b4674a6ada68197e5b6e1", "521b25ffe3e6414d918d9c38dc0f6251", "49e97c7601b1484cb32de9a14b66a80e", "a60aa23254dc402397580e8cfa14dde2", "97016178d3b249398e3bbbd6902a1f45", "e4203f38f1e34e9baa953248c5d04867", "55fee200080e4ffa842c7b3ccba6202c", "681e215549a64a2d86046c20f943accb", "585daf7d5699401284c3fe9991cfbd1d", "0754f254add24b1bbac53908ace0674e", "1c5dd19c1b994e20b6286f921d6e3a08", "c05c707a897242cebb1844ba012ba823", "3d05b61f2061429095a1a2d4dc7500dd", "a8e4bf0b21184ccf9951bdb9114fd10e", "96934e1a081e480e9608f355cca41566", "68f47edfbd5d4bacaa3e6424c98f8961", "ffa81a80e84546148817330dd5c9eefa", "c9d0683b6fb64df1bc370de4011fe77f", "d1dc02d934cc417b983dc13d80b22d31", "cad51788127f440393bea3c2d165f42e", "2eaee4ea34a647259f015fbd9190e821", "710c585a886d4d17bd66ebef545b3371", "f2130c9ea479452699ce1389a74097e6", "f7db554186144eb0807f71fc142580fa", "0c61a26cae2d4e2f851be13f51ee78c3", "ec49ebfb9b16430181b4738b01844692", "b6f81ce13f774cc1a42ec659d50bfca3", "ffc2e024df9241de9d7c22269a47fe96", "c7581989b1d84bc5b453551440671767", "f841b17df29b42799c740b305f44c3f5", "fb3820553c8246f8a3f19d11f2929e26", "9a0be6bf09a943dba3582e78661b64ce", "97881b93a47e470eb9673ef97a28d741", "696a8fb7186e4ad9bed904d79cc5da17", "fb483bee70b04cd1bc2cef99941dfbe8", "dc610f57bb0d42a4970fd56e2beafd3e", "77f9434bf9984bbf9d7e0b409f58c236", "1b7e9249af884ecd89bdfe4da708ac38", "c3e369bcece54aa4bb6baba02f6c4c30" ] }, "id": "gTDYJRFtQ0Xm", "outputId": "637b27dc-3121-4534-84a1-1ee31e97a5ab" }, "execution_count": null, "outputs": [ { "metadata": { "tags": null }, "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" ] }, { "metadata": { "tags": null }, "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n", " tweet label\n", "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", "1 I scooped a couple of shares this morning at a... 0\n", "2 Im streaming ES Futures using Bookmap on youtu... 0\n", "3 CF taking some off here close to 19150 1\n", "4 No change to this position is still bullish st... 0\n" ] }, { "metadata": { "tags": null }, "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f7b7943b77ec48709ab2c14ff9c7d8d0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/725 [00:00\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelBest ParametersBest Cross-validation Accuracy
0Random Forest{'classifier__max_depth': None, 'classifier__n...0.6981
1Logistic Regression{'classifier__C': 10}0.7055
2Support Vector Machine{'classifier__C': 10, 'classifier__gamma': 'sc...0.7145
3Multinomial Naive Bayes{'classifier__alpha': 0.1}0.6957
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display_results(results)\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"Model\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Logistic Regression\",\n \"Multinomial Naive Bayes\",\n \"Random Forest\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Parameters\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Cross-validation Accuracy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.008465419855703142,\n \"min\": 0.6957,\n \"max\": 0.7144999999999999,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.7055,\n 0.6957,\n 0.6980999999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "#تعديل على الكود #using model from hugging face\n", "\n", "# Libraries\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "import time\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n", "from transformers import pipeline\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# Read CSV file\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "print(df.head()) # Display the data\n", "\n", "# Preprocess text function\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target (avoiding data leakage)\n", "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Function to display results in a DataFrame\n", "def display_results(results):\n", " df_results = pd.DataFrame(results)\n", " print(df_results)\n", "\n", "# Initialize an empty list to store results\n", "results = []\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X_train, y_train)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X_train, y_train)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X_train, y_train)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X_train, y_train)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", "\n", "# Display results\n", "display_results(results)\n", "\n", "# Evaluate on test data (including additional metrics)\n", "print(\"Performance on Test Set:\")\n", "for model_name, model in [('Random Forest', grid_search_rf), ('Logistic Regression', grid_search_lr), ('Support Vector Machine', grid_search_svm), ('Multinomial Naive Bayes', grid_search_nb)]:\n", " y_pred = model.predict(X_test)\n", " print(f\"Model: {model_name}\")\n", " print(classification_report(y_test, y_pred))\n", " print(confusion_matrix(y_test, y_pred))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "D1PMZ1g5ts_Z", "outputId": "5044b9c4-7106-4ffa-f455-1d383e96bea8" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", " tweet label\n", "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", "1 I scooped a couple of shares this morning at a... 0\n", "2 Im streaming ES Futures using Bookmap on youtu... 0\n", "3 CF taking some off here close to 19150 1\n", "4 No change to this position is still bullish st... 0\n", "Tuning hyperparameters for Random Forest:\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", " pid = os.fork()\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Logistic Regression:\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Support Vector Machine:\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n", " Model Best Parameters \\\n", "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", "1 Logistic Regression {'classifier__C': 10} \n", "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... \n", "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", "\n", " Best Cross-validation Accuracy \n", "0 0.691000 \n", "1 0.698375 \n", "2 0.707000 \n", "3 0.689750 \n", "Performance on Test Set:\n", "Model: Random Forest\n", " precision recall f1-score support\n", "\n", " -1 0.70 0.51 0.59 512\n", " 0 0.68 0.86 0.76 985\n", " 1 0.78 0.58 0.67 503\n", "\n", " accuracy 0.70 2000\n", " macro avg 0.72 0.65 0.67 2000\n", "weighted avg 0.71 0.70 0.69 2000\n", "\n", "[[260 218 34]\n", " [ 90 847 48]\n", " [ 22 188 293]]\n", "Model: Logistic Regression\n", " precision recall f1-score support\n", "\n", " -1 0.68 0.57 0.62 512\n", " 0 0.70 0.79 0.74 985\n", " 1 0.69 0.62 0.66 503\n", "\n", " accuracy 0.69 2000\n", " macro avg 0.69 0.66 0.67 2000\n", "weighted avg 0.69 0.69 0.69 2000\n", "\n", "[[294 174 44]\n", " [113 777 95]\n", " [ 25 164 314]]\n", "Model: Support Vector Machine\n", " precision recall f1-score support\n", "\n", " -1 0.70 0.56 0.62 512\n", " 0 0.68 0.84 0.75 985\n", " 1 0.77 0.57 0.66 503\n", "\n", " accuracy 0.70 2000\n", " macro avg 0.72 0.66 0.68 2000\n", "weighted avg 0.71 0.70 0.69 2000\n", "\n", "[[285 200 27]\n", " [ 99 827 59]\n", " [ 22 194 287]]\n", "Model: Multinomial Naive Bayes\n", " precision recall f1-score support\n", "\n", " -1 0.67 0.59 0.63 512\n", " 0 0.68 0.80 0.74 985\n", " 1 0.71 0.55 0.62 503\n", "\n", " accuracy 0.69 2000\n", " macro avg 0.69 0.65 0.66 2000\n", "weighted avg 0.69 0.69 0.68 2000\n", "\n", "[[300 175 37]\n", " [117 792 76]\n", " [ 28 196 279]]\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# قراءة ملف CSV باستخدام Pandas\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "\n", "# Preprocess text column\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub('@[^\\s]+', ' ', text)\n", " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", " text = re.sub('[0-9]+', '', text)\n", " text = \" \".join(str(text).split())\n", " text = [w for w in text.split() if w not in stopwords.words('english')]\n", " text = \" \".join(text)\n", " text = WordNetLemmatizer().lemmatize(text)\n", " return text\n", "\n", "df['tweet'] = df['tweet'].apply(preprocess_text)\n", "\n", "# Split data into features and target\n", "X = df['tweet']\n", "y = df['label']\n", "\n", "# Initialize RandomForestClassifier\n", "rf = RandomForestClassifier()\n", "\n", "# Define hyperparameters grid\n", "param_grid = {\n", " 'rf__n_estimators': [50, 100, 200],\n", " 'rf__max_depth': [None, 10, 20],\n", "}\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Define pipeline\n", "pipeline = Pipeline([\n", " ('vectorizer', TfidfVectorizer()),\n", " ('rf', rf)\n", "])\n", "\n", "# Perform hyperparameter tuning\n", "grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search.fit(X, y)\n", "\n", "# Get best model and parameters\n", "best_model = grid_search.best_estimator_\n", "best_params = grid_search.best_params_\n", "best_score = grid_search.best_score_\n", "\n", "print(\"Best parameters:\", best_params)\n", "print(\"Best cross-validation accuracy:\", best_score)\n", "\n", "# Fit the best model\n", "best_model.fit(X, y)\n", "\n", "# Predict using the best model\n", "y_pred = best_model.predict(X)\n", "\n", "# Calculate accuracy\n", "accuracy = accuracy_score(y, y_pred)\n", "print(\"Final Accuracy:\", accuracy)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UhM3zb7S-XI6", "outputId": "1934f8c1-a924-40d9-e840-f3c714e54e83" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", "Best parameters: {'rf__max_depth': None, 'rf__n_estimators': 200}\n", "Best cross-validation accuracy: 0.6987\n", "Final Accuracy: 0.9955\n" ] } ] }, { "cell_type": "code", "source": [ "#بدون معالجة\n", "# Libraries\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "import time\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.pipeline import Pipeline\n", "\n", "nltk.download('stopwords')\n", "\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "# Read CSV file\n", "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", "print(df.head()) # Display the data\n", "\n", "# Split data into features and target (avoiding data leakage)\n", "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n", "\n", "# Initialize KFold cross-validator\n", "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "# Function to display results in a DataFrame\n", "def display_results(results):\n", " df_results = pd.DataFrame(results)\n", " print(df_results)\n", "\n", "# Initialize an empty list to store results\n", "results = []\n", "\n", "# Define hyperparameters grid for Random Forest\n", "param_grid_rf = {\n", " 'classifier__n_estimators': [50, 100, 200],\n", " 'classifier__max_depth': [None, 10, 20]\n", "}\n", "\n", "# Define hyperparameters grid for Logistic Regression\n", "param_grid_lr = {\n", " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", "}\n", "\n", "# Define hyperparameters grid for Support Vector Machine\n", "param_grid_svm = {\n", " 'classifier__C': [0.1, 1, 10],\n", " 'classifier__gamma': ['scale', 'auto']\n", "}\n", "\n", "# Define hyperparameters grid for Multinomial Naive Bayes\n", "param_grid_nb = {\n", " 'classifier__alpha': [0.1, 0.5, 1.0]\n", "}\n", "\n", "# Random Forest\n", "print(\"Tuning hyperparameters for Random Forest:\")\n", "pipeline_rf = Pipeline([\n", " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", " ('classifier', RandomForestClassifier())\n", "])\n", "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_rf.fit(X_train, y_train)\n", "best_params_rf = grid_search_rf.best_params_\n", "best_score_rf = grid_search_rf.best_score_\n", "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", "\n", "# Logistic Regression\n", "print(\"Tuning hyperparameters for Logistic Regression:\")\n", "pipeline_lr = Pipeline([\n", " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", " ('classifier', LogisticRegression())\n", "])\n", "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_lr.fit(X_train, y_train)\n", "best_params_lr = grid_search_lr.best_params_\n", "best_score_lr = grid_search_lr.best_score_\n", "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", "\n", "# Support Vector Machine\n", "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", "pipeline_svm = Pipeline([\n", " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", " ('classifier', SVC())\n", "])\n", "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_svm.fit(X_train, y_train)\n", "best_params_svm = grid_search_svm.best_params_\n", "best_score_svm = grid_search_svm.best_score_\n", "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", "\n", "# Multinomial Naive Bayes\n", "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", "pipeline_nb = Pipeline([\n", " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", " ('classifier', MultinomialNB())\n", "])\n", "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", "grid_search_nb.fit(X_train, y_train)\n", "best_params_nb = grid_search_nb.best_params_\n", "best_score_nb = grid_search_nb.best_score_\n", "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", "\n", "# Display results\n", "display_results(results)\n", "\n", "# Evaluate on test data (including additional metrics)\n", "print(\"Performance on Test Set:\")\n", "for model_name, model in [('Random Forest', grid_search_rf), ('Logistic Regression', grid_search_lr), ('Support Vector Machine', grid_search_svm), ('Multinomial Naive Bayes', grid_search_nb)]:\n", " y_pred = model.predict(X_test)\n", " print(f\"Model: {model_name}\")\n", " print(classification_report(y_test, y_pred))\n", " print(confusion_matrix(y_test, y_pred))\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ABQvR4RLKbwq", "outputId": "3a5fdc25-85d3-43c1-b69d-6ae95fdce6bb" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", " tweet label\n", "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", "1 I scooped a couple of shares this morning at a... 0\n", "2 Im streaming ES Futures using Bookmap on youtu... 0\n", "3 CF taking some off here close to 19150 1\n", "4 No change to this position is still bullish st... 0\n", "Tuning hyperparameters for Random Forest:\n", "Tuning hyperparameters for Logistic Regression:\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Tuning hyperparameters for Support Vector Machine:\n", "Tuning hyperparameters for Multinomial Naive Bayes:\n", " Model Best Parameters \\\n", "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", "1 Logistic Regression {'classifier__C': 10} \n", "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... \n", "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", "\n", " Best Cross-validation Accuracy \n", "0 0.686875 \n", "1 0.700625 \n", "2 0.705000 \n", "3 0.695625 \n", "Performance on Test Set:\n", "Model: Random Forest\n", " precision recall f1-score support\n", "\n", " -1 0.71 0.51 0.59 512\n", " 0 0.68 0.87 0.76 985\n", " 1 0.76 0.56 0.64 503\n", "\n", " accuracy 0.70 2000\n", " macro avg 0.72 0.65 0.67 2000\n", "weighted avg 0.71 0.70 0.69 2000\n", "\n", "[[259 209 44]\n", " [ 81 860 44]\n", " [ 23 200 280]]\n", "Model: Logistic Regression\n", " precision recall f1-score support\n", "\n", " -1 0.68 0.59 0.63 512\n", " 0 0.70 0.79 0.74 985\n", " 1 0.70 0.63 0.66 503\n", "\n", " accuracy 0.70 2000\n", " macro avg 0.69 0.67 0.68 2000\n", "weighted avg 0.70 0.70 0.69 2000\n", "\n", "[[303 170 39]\n", " [115 774 96]\n", " [ 29 159 315]]\n", "Model: Support Vector Machine\n", " precision recall f1-score support\n", "\n", " -1 0.70 0.55 0.61 512\n", " 0 0.68 0.84 0.75 985\n", " 1 0.78 0.58 0.66 503\n", "\n", " accuracy 0.70 2000\n", " macro avg 0.72 0.65 0.67 2000\n", "weighted avg 0.71 0.70 0.69 2000\n", "\n", "[[281 206 25]\n", " [100 826 59]\n", " [ 23 190 290]]\n", "Model: Multinomial Naive Bayes\n", " precision recall f1-score support\n", "\n", " -1 0.66 0.61 0.63 512\n", " 0 0.69 0.79 0.73 985\n", " 1 0.69 0.55 0.62 503\n", "\n", " accuracy 0.68 2000\n", " macro avg 0.68 0.65 0.66 2000\n", "weighted avg 0.68 0.68 0.68 2000\n", "\n", "[[311 159 42]\n", " [128 776 81]\n", " [ 32 192 279]]\n" ] } ] } ] }