diff --git "a/MachineLearning_models.ipynb" "b/MachineLearning_models.ipynb" new file mode 100644--- /dev/null +++ "b/MachineLearning_models.ipynb" @@ -0,0 +1,4629 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "f7b7943b77ec48709ab2c14ff9c7d8d0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8ccb497ccc254b4fbb83a6075d999c98", + "IPY_MODEL_462e849b058049eb90bdec861c4c6733", + "IPY_MODEL_433354ec1982417b8e182d982451cff8" + ], + "layout": "IPY_MODEL_79e9f2981ea4421a864c14f0c3c47c88" + } + }, + "8ccb497ccc254b4fbb83a6075d999c98": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1f7ddf0908aa4a4bbf20c0a98c534802", + "placeholder": "​", + "style": "IPY_MODEL_a62aa5c3bacd439b88b66448048054de", + "value": "config.json: 100%" + } + }, + "462e849b058049eb90bdec861c4c6733": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_29f23229640c4a3b978c630bed86b0a0", + "max": 725, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_65e51c3b78c14a1db0b8b730534aa5b9", + "value": 725 + } + }, + "433354ec1982417b8e182d982451cff8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_108e829acb474ad7b9b5d4576fbfd340", + "placeholder": "​", + "style": "IPY_MODEL_7f94536dab254a8ebbce57f65a99511d", + "value": " 725/725 [00:00<00:00, 30.0kB/s]" + } + }, + "79e9f2981ea4421a864c14f0c3c47c88": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f7ddf0908aa4a4bbf20c0a98c534802": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a62aa5c3bacd439b88b66448048054de": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "29f23229640c4a3b978c630bed86b0a0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "65e51c3b78c14a1db0b8b730534aa5b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "108e829acb474ad7b9b5d4576fbfd340": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f94536dab254a8ebbce57f65a99511d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e2dc2d66fe9d4c068b7400aa68166d82": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c4fdbaab18b04bd092bc8327dc9384f9", + "IPY_MODEL_76d175d827dc479da41cf4d195176a73", + "IPY_MODEL_02d1a242c3c346f9ba00d4ef54632080" + ], + "layout": "IPY_MODEL_3afe49394a7b4e63abbfa6eea620054b" + } + }, + "c4fdbaab18b04bd092bc8327dc9384f9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5cc5407d5562422b8b4306b9b4540ab8", + "placeholder": "​", + "style": "IPY_MODEL_2bbd4694331d42b69a279571576deb51", + "value": "pytorch_model.bin: 100%" + } + }, + "76d175d827dc479da41cf4d195176a73": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23c88c6f258a402288b6ffe041f61f0b", + "max": 1425820242, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8c75ac06d39f4bf58c4b2cdf513c47a1", + "value": 1425820242 + } + }, + "02d1a242c3c346f9ba00d4ef54632080": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ac9cf05fb66c4b28beebe6ce874a296b", + "placeholder": "​", + "style": "IPY_MODEL_5180b588b5f546a1a5c1b99842ad9bc4", + "value": " 1.43G/1.43G [00:13<00:00, 119MB/s]" + } + }, + "3afe49394a7b4e63abbfa6eea620054b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5cc5407d5562422b8b4306b9b4540ab8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bbd4694331d42b69a279571576deb51": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "23c88c6f258a402288b6ffe041f61f0b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8c75ac06d39f4bf58c4b2cdf513c47a1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ac9cf05fb66c4b28beebe6ce874a296b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5180b588b5f546a1a5c1b99842ad9bc4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "536c41525a424101a64e8c3c7fe6a1a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c95bb7e9112f49bbb4b266fe7462586b", + "IPY_MODEL_3d53fbf327a74eccbdeb2bdd7e0e9ea6", + "IPY_MODEL_f15764b0fbcc470da3735693598b03d0" + ], + "layout": "IPY_MODEL_6b9d84472ca14e869b0eedfd5faa853e" + } + }, + "c95bb7e9112f49bbb4b266fe7462586b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_e7c76fe4370b4674a6ada68197e5b6e1", + "placeholder": "​", + "style": "IPY_MODEL_521b25ffe3e6414d918d9c38dc0f6251", + "value": "tokenizer_config.json: 100%" + } + }, + "3d53fbf327a74eccbdeb2bdd7e0e9ea6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_49e97c7601b1484cb32de9a14b66a80e", + "max": 1070, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a60aa23254dc402397580e8cfa14dde2", + "value": 1070 + } + }, + "f15764b0fbcc470da3735693598b03d0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_97016178d3b249398e3bbbd6902a1f45", + "placeholder": "​", + "style": "IPY_MODEL_e4203f38f1e34e9baa953248c5d04867", + "value": " 1.07k/1.07k [00:00<00:00, 52.3kB/s]" + } + }, + "6b9d84472ca14e869b0eedfd5faa853e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e7c76fe4370b4674a6ada68197e5b6e1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "521b25ffe3e6414d918d9c38dc0f6251": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "49e97c7601b1484cb32de9a14b66a80e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a60aa23254dc402397580e8cfa14dde2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "97016178d3b249398e3bbbd6902a1f45": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4203f38f1e34e9baa953248c5d04867": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "55fee200080e4ffa842c7b3ccba6202c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_681e215549a64a2d86046c20f943accb", + "IPY_MODEL_585daf7d5699401284c3fe9991cfbd1d", + "IPY_MODEL_0754f254add24b1bbac53908ace0674e" + ], + "layout": "IPY_MODEL_1c5dd19c1b994e20b6286f921d6e3a08" + } + }, + "681e215549a64a2d86046c20f943accb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c05c707a897242cebb1844ba012ba823", + "placeholder": "​", + "style": "IPY_MODEL_3d05b61f2061429095a1a2d4dc7500dd", + "value": "vocab.json: 100%" + } + }, + "585daf7d5699401284c3fe9991cfbd1d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8e4bf0b21184ccf9951bdb9114fd10e", + "max": 898822, + "min": 0, + "orientation": "horizontal", + 
"style": "IPY_MODEL_96934e1a081e480e9608f355cca41566", + "value": 898822 + } + }, + "0754f254add24b1bbac53908ace0674e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68f47edfbd5d4bacaa3e6424c98f8961", + "placeholder": "​", + "style": "IPY_MODEL_ffa81a80e84546148817330dd5c9eefa", + "value": " 899k/899k [00:00<00:00, 3.38MB/s]" + } + }, + "1c5dd19c1b994e20b6286f921d6e3a08": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } 
+ }, + "c05c707a897242cebb1844ba012ba823": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d05b61f2061429095a1a2d4dc7500dd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a8e4bf0b21184ccf9951bdb9114fd10e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "96934e1a081e480e9608f355cca41566": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "68f47edfbd5d4bacaa3e6424c98f8961": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ffa81a80e84546148817330dd5c9eefa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c9d0683b6fb64df1bc370de4011fe77f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d1dc02d934cc417b983dc13d80b22d31", + "IPY_MODEL_cad51788127f440393bea3c2d165f42e", + "IPY_MODEL_2eaee4ea34a647259f015fbd9190e821" + ], + "layout": "IPY_MODEL_710c585a886d4d17bd66ebef545b3371" + } + }, + "d1dc02d934cc417b983dc13d80b22d31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f2130c9ea479452699ce1389a74097e6", + "placeholder": "​", + "style": "IPY_MODEL_f7db554186144eb0807f71fc142580fa", + "value": "merges.txt: 100%" + } + }, + "cad51788127f440393bea3c2d165f42e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0c61a26cae2d4e2f851be13f51ee78c3", + "max": 456318, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ec49ebfb9b16430181b4738b01844692", + "value": 456318 + } + }, + "2eaee4ea34a647259f015fbd9190e821": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b6f81ce13f774cc1a42ec659d50bfca3", + "placeholder": "​", + "style": "IPY_MODEL_ffc2e024df9241de9d7c22269a47fe96", + "value": " 456k/456k [00:00<00:00, 3.42MB/s]" + } + }, + "710c585a886d4d17bd66ebef545b3371": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f2130c9ea479452699ce1389a74097e6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7db554186144eb0807f71fc142580fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0c61a26cae2d4e2f851be13f51ee78c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec49ebfb9b16430181b4738b01844692": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b6f81ce13f774cc1a42ec659d50bfca3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ffc2e024df9241de9d7c22269a47fe96": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c7581989b1d84bc5b453551440671767": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f841b17df29b42799c740b305f44c3f5", + "IPY_MODEL_fb3820553c8246f8a3f19d11f2929e26", + "IPY_MODEL_9a0be6bf09a943dba3582e78661b64ce" + ], + "layout": "IPY_MODEL_97881b93a47e470eb9673ef97a28d741" + } + }, + "f841b17df29b42799c740b305f44c3f5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_696a8fb7186e4ad9bed904d79cc5da17", + "placeholder": "​", + "style": "IPY_MODEL_fb483bee70b04cd1bc2cef99941dfbe8", + "value": "special_tokens_map.json: 100%" + } + }, + "fb3820553c8246f8a3f19d11f2929e26": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", 
+ "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dc610f57bb0d42a4970fd56e2beafd3e", + "max": 772, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_77f9434bf9984bbf9d7e0b409f58c236", + "value": 772 + } + }, + "9a0be6bf09a943dba3582e78661b64ce": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b7e9249af884ecd89bdfe4da708ac38", + "placeholder": "​", + "style": "IPY_MODEL_c3e369bcece54aa4bb6baba02f6c4c30", + "value": " 772/772 [00:00<00:00, 35.6kB/s]" + } + }, + "97881b93a47e470eb9673ef97a28d741": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "696a8fb7186e4ad9bed904d79cc5da17": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb483bee70b04cd1bc2cef99941dfbe8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dc610f57bb0d42a4970fd56e2beafd3e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "77f9434bf9984bbf9d7e0b409f58c236": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1b7e9249af884ecd89bdfe4da708ac38": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3e369bcece54aa4bb6baba02f6c4c30": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import 
LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# Read the CSV file with pandas\n",
+ "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
+ "\n",
+ "# Build the stop-word set and the lemmatizer once instead of once per token\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "lemmatizer = WordNetLemmatizer()\n",
+ "\n",
+ "# Preprocess text data\n",
+ "df['tweet'] = df['tweet'].str.lower()\n",
+ "# Strip @mentions and URLs (raw strings keep the regex escapes intact)\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'@[^\\s]+', ' ', x))\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n",
+ "# Escape the punctuation so none of it is read as character-class syntax\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]', ' ', x))\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n",
+ "# Drop stop words and lemmatize token by token: lemmatize() looks up one word at a time,\n",
+ "# so handing it the whole tweet left the text effectively unchanged\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(lemmatizer.lemmatize(w) for w in x.split() if w not in stop_words))\n",
+ "\n",
+ "# Pick the train/test rows before fitting anything, so the vocabulary and IDF weights\n",
+ "# are learned from training rows only and nothing leaks from the test set\n",
+ "train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Fit the bag of words transformer to the training tweets\n",
+ "bow_transformer = CountVectorizer().fit(df['tweet'].iloc[train_idx])\n",
+ "\n",
+ "# Transform the text column to bag of words representation\n",
+ "text_bow = bow_transformer.transform(df['tweet'])\n",
+ "\n",
+ "# Fit the Tf-Idf transformer on the training rows, then apply it to every row\n",
+ "tfidf_transformer = TfidfTransformer().fit(text_bow[train_idx])\n",
+ "text_tfidf = tfidf_transformer.transform(text_bow)\n",
+ "\n",
+ "# Split the data into train and test sets\n",
+ "X_train, X_test = text_tfidf[train_idx], text_tfidf[test_idx]\n",
+ "y_train, y_test = df['label'].iloc[train_idx], df['label'].iloc[test_idx]\n",
+ "\n",
+ "# Initialize and train the model\n",
+ "model = LogisticRegression()\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# Predict using the test set\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "# Calculate accuracy\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy:\", accuracy)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ELI_EP93ws0o",
+ "outputId": "0bae90bc-d511-4888-c507-cc5908f38a82"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Accuracy: 0.677\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import string\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+ "from sklearn.model_selection import train_test_split, KFold\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.naive_bayes import MultinomialNB\n",
+ "from xgboost import XGBClassifier\n",
+ "\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# Read the CSV file with pandas\n",
+ "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
+ "\n",
+ "# Build the stop-word set and the lemmatizer once instead of once per token\n",
+ "stop_words = set(stopwords.words('english'))\n",
+ "lemmatizer = WordNetLemmatizer()\n",
+ "\n",
+ "# Preprocess text data\n",
+ "df['tweet'] = df['tweet'].str.lower()\n",
+ "# Strip @mentions and URLs (raw strings keep the regex escapes intact)\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'@[^\\s]+', ' ', x))\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n",
+ "# Escape the punctuation so none of it is read as character-class syntax\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]', ' ', x))\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n",
+ "# Drop stop words and lemmatize token by token: lemmatize() looks up one word at a time,\n",
+ "# so handing it the whole tweet left the text effectively unchanged\n",
+ "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(lemmatizer.lemmatize(w) for w in x.split() if w not in stop_words))\n",
+ "\n",
+ "# Split the data into features and target\n",
+ "X = df['tweet']\n",
+ "y = df['label']\n",
+ "\n",
+ "# Initialize models\n",
+ "models = {\n",
+ "    'Logistic Regression': LogisticRegression(),\n",
+ "    'Random Forest': RandomForestClassifier(),\n",
+ "    'Support Vector Machine': SVC(),\n",
+ "    'Multinomial Naive Bayes': MultinomialNB(),\n",
+ "    'XGBoost': XGBClassifier()\n",
+ "}\n",
+ "\n",
+ "# Apply K-Fold cross-validation\n",
+ "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
+ "\n",
+ "for model_name, model in models.items():\n",
+ "    print(f\"Training {model_name}:\")\n",
+ "    accuracies = []\n",
+ "    for train_index, test_index in kf.split(X):\n",
+ "        X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n",
+ "        y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n",
+ "\n",
+ "        # Fit the bag of words transformer to the training fold only\n",
+ "        bow_transformer = CountVectorizer().fit(X_train)\n",
+ "        # Transform both folds to the bag of words representation\n",
+ "        text_bow_train = bow_transformer.transform(X_train)\n",
+ "        text_bow_test = bow_transformer.transform(X_test)\n",
+ "\n",
+ "        # Apply Tf-Idf transformer to the bag of words representation\n",
+ "        tfidf_transformer = TfidfTransformer().fit(text_bow_train)\n",
+ "        text_tfidf_train = tfidf_transformer.transform(text_bow_train)\n",
+ "        text_tfidf_test = tfidf_transformer.transform(text_bow_test)\n",
+ "\n",
+ "        # Train the model\n",
+ "        model.fit(text_tfidf_train, y_train)\n",
+ "\n",
+ "        # Predict using the test set\n",
+ "        y_pred = model.predict(text_tfidf_test)\n",
+ "\n",
+ "        # Calculate accuracy\n",
+ "        accuracy = accuracy_score(y_test, y_pred)\n",
+ "        accuracies.append(accuracy)\n",
+ "        print(f\" - Fold accuracy: {accuracy}\")\n",
+ "\n",
+ "    # Average accuracy across all folds\n",
+ "    avg_accuracy = np.mean(accuracies)\n",
+ "    print(f\"{model_name} average accuracy: {avg_accuracy}\\n\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 369
+ },
+ "id": "mo68I9qZ2zqB", +
"outputId": "3bff4613-0bfd-4d46-dd0c-3b70b976488c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "error", + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Read the CSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive/MyDrive/train (1).csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m# Preprocess text data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 910\u001b[0m 
\u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 912\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 913\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 914\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 577\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 578\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1405\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1406\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1407\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1409\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1660\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1661\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 1662\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1663\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# قراءة ملف CSV باستخدام Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "\n", + "# Preprocess text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', 
text)\n", + " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Initialize models\n", + "models = {\n", + " 'Random Forest': RandomForestClassifier(),\n", + " 'Logistic Regression': LogisticRegression(),\n", + " 'Support Vector Machine': SVC(),\n", + " 'Multinomial Naive Bayes': MultinomialNB()\n", + "}\n", + "\n", + "# Define hyperparameters grid for each model\n", + "param_grid = {\n", + " 'Random Forest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]},\n", + " 'Logistic Regression': {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},\n", + " 'Support Vector Machine': {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']},\n", + " 'Multinomial Naive Bayes': {'classifier__alpha': [0.1, 0.5, 1.0]},\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Perform hyperparameter tuning for each model\n", + "for model_name, model in models.items():\n", + " print(f\"Tuning hyperparameters for {model_name}:\")\n", + "\n", + " # Define pipeline with TfidfVectorizer and model\n", + " pipeline = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', model)\n", + " ])\n", + "\n", + " # Perform grid search\n", + " grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=kf, scoring='accuracy', n_jobs=-1)\n", + " grid_search.fit(X, y)\n", + "\n", + " # Print best parameters and best cross-validation 
accuracy\n", + " best_params = grid_search.best_params_\n", + " best_score = grid_search.best_score_\n", + " print(f\"Best parameters: {best_params}\")\n", + " print(f\"Best cross-validation accuracy: {best_score}\\n\")\n", + "\n", + "# Choose the best model\n", + "best_model_name = max(models, key=lambda k: grid_search.cv_results_['mean_test_score'][np.argwhere(grid_search.cv_results_['rank_test_score'] == 1)[0][0]])\n", + "best_model = grid_search.best_estimator_\n", + "\n", + "# Fit the best model\n", + "best_model.fit(X, y)\n", + "\n", + "# Predict using the best model\n", + "y_pred = best_model.predict(X)\n", + "\n", + "# Calculate accuracy\n", + "accuracy = accuracy_score(y, y_pred)\n", + "print(\"Final Accuracy:\", accuracy)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YxlqbMM74t1m", + "outputId": "ecde4844-a8bd-42f9-ff3b-b88599c5c0cc" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Random Forest:\n", + "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}\n", + "Best cross-validation accuracy: 0.6992999999999999\n", + "\n", + "Tuning hyperparameters for Logistic Regression:\n", + "Best parameters: {'classifier__C': 10}\n", + "Best cross-validation accuracy: 0.7055\n", + "\n", + "Tuning hyperparameters for Support Vector Machine:\n", + "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", + "Best cross-validation accuracy: 0.7144999999999999\n", + "\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n", + "Best parameters: {'classifier__alpha': 0.1}\n", + "Best cross-validation accuracy: 0.6957\n", + "\n", + "Final Accuracy: 0.9004\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#فصل اعدادات كل موديل\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import 
stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# قراءة ملف CSV باستخدام Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "\n", + "# Preprocess text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', text)\n", + " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': 
['scale', 'auto']\n", + "}\n", + "\n", + "# Define hyperparameters grid for Multinomial Naive Bayes\n", + "param_grid_nb = {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X, y)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "print(f\"Best parameters: {best_params_rf}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_rf}\\n\")\n", + "\n", + "# Logistic Regression\n", + "print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X, y)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "print(f\"Best parameters: {best_params_lr}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_lr}\\n\")\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_svm.fit(X, y)\n", + "best_params_svm = grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "print(f\"Best parameters: {best_params_svm}\")\n", + "print(f\"Best 
cross-validation accuracy: {best_score_svm}\\n\")\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X, y)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "print(f\"Best parameters: {best_params_nb}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_nb}\\n\")\n", + "\n", + "# Choose the best model\n", + "best_model_name = max({\n", + " 'Random Forest': best_score_rf,\n", + " 'Logistic Regression': best_score_lr,\n", + " 'Support Vector Machine': best_score_svm,\n", + " 'Multinomial Naive Bayes': best_score_nb\n", + "}.items(), key=lambda x: x[1])[0]\n", + "best_model = None\n", + "\n", + "if best_model_name == 'Random Forest':\n", + " best_model = grid_search_rf.best_estimator_\n", + "elif best_model_name == 'Logistic Regression':\n", + " best_model = grid_search_lr.best_estimator_\n", + "elif best_model_name == 'Support Vector Machine':\n", + " best_model = grid_search_svm.best_estimator_\n", + "elif best_model_name == 'Multinomial Naive Bayes':\n", + " best_model = grid_search_nb.best_estimator_\n", + "\n", + "# Fit the best model\n", + "if best_model:\n", + " best_model.fit(X, y)\n", + " # Predict using the best model\n", + " y_pred = best_model.predict(X)\n", + " # Calculate accuracy\n", + " accuracy = accuracy_score(y, y_pred)\n", + " print(\"Final Accuracy:\", accuracy)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eOKgOLHmFc8u", + "outputId": "93efecb0-a9c5-4497-9544-2959fa534f4d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Random Forest:\n", +
"Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n", + "Best cross-validation accuracy: 0.6977\n", + "\n", + "Tuning hyperparameters for Logistic Regression:\n", + "Best parameters: {'classifier__C': 10}\n", + "Best cross-validation accuracy: 0.7055\n", + "\n", + "Tuning hyperparameters for Support Vector Machine:\n", + "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", + "Best cross-validation accuracy: 0.7144999999999999\n", + "\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n", + "Best parameters: {'classifier__alpha': 0.1}\n", + "Best cross-validation accuracy: 0.6957\n", + "\n", + "Final Accuracy: 0.9954\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import time\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# قراءة ملف CSV باستخدام Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "df # إظهار البيانات\n", + "\n", + "# Preprocess text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', text)\n", + " text =
re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Define hyperparameters grid for Multinomial Naive Bayes\n", + "param_grid_nb = {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X, y)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "print(f\"Best parameters: {best_params_rf}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_rf}\\n\")\n", + "\n", + "# Logistic Regression\n", + 
"print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X, y)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "print(f\"Best parameters: {best_params_lr}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_lr}\\n\")\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_svm.fit(X, y)\n", + "best_params_svm = grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "print(f\"Best parameters: {best_params_svm}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_svm}\\n\")\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X, y)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "print(f\"Best parameters: {best_params_nb}\")\n", + "print(f\"Best cross-validation accuracy: {best_score_nb}\\n\")\n", + "\n", + "# Choose the best model\n", + "best_model_name = max({\n", + " 'Random Forest': best_score_rf,\n", + " 'Logistic Regression': best_score_lr,\n", + " 'Support Vector Machine': best_score_svm,\n", + " 'Multinomial Naive Bayes': best_score_nb\n", + "}.items(), 
key=lambda x: x[1])[0]\n", + "best_model = None\n", + "\n", + "if best_model_name == 'Random Forest':\n", + " best_model = grid_search_rf.best_estimator_\n", + "elif best_model_name == 'Logistic Regression':\n", + " best_model = grid_search_lr.best_estimator_\n", + "elif best_model_name == 'Support Vector Machine':\n", + " best_model = grid_search_svm.best_estimator_\n", + "elif best_model_name == 'Multinomial Naive Bayes':\n", + " best_model = grid_search_nb.best_estimator_\n", + "\n", + "# Fit the best model\n", + "if best_model:\n", + " best_model.fit(X, y)\n", + " # Predict using the best model\n", + " y_pred = best_model.predict(X)\n", + " # Calculate accuracy\n", + " accuracy = accuracy_score(y, y_pred)\n", + " print(\"Final Accuracy:\", accuracy)\n", + "\n", + " # Calculate and print confusion matrix and classification report\n", + " cm = confusion_matrix(y, y_pred)\n", + " cr = classification_report(y, y_pred)\n", + " print(\"Confusion Matrix:\")\n", + " print(cm)\n", + " print(\"Classification Report:\")\n", + " print(cr)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xOXg8UEbC-kj", + "outputId": "409ac90e-a68d-451b-b299-b3322f54803f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + "Tuning hyperparameters for Random Forest:\n", + "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n", + "Best cross-validation accuracy: 0.6995999999999999\n", + "\n", + "Tuning hyperparameters for Logistic Regression:\n" + ] + }, + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best parameters: {'classifier__C': 10}\n", + "Best cross-validation accuracy: 0.7055\n", + "\n", + "Tuning hyperparameters for Support Vector Machine:\n", + "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n", + "Best cross-validation accuracy: 0.7144999999999999\n", + "\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n", + "Best parameters: {'classifier__alpha': 0.1}\n", + "Best cross-validation accuracy: 0.6957\n", + "\n", + "Final Accuracy: 0.9954\n", + "Confusion Matrix:\n", + "[[2423 8 1]\n", + " [ 4 4966 12]\n", + " [ 0 21 2565]]\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " -1 1.00 1.00 1.00 2432\n", + " 0 0.99 1.00 1.00 4982\n", + " 1 0.99 0.99 0.99 2586\n", + "\n", + " accuracy 1.00 10000\n", + " macro avg 1.00 0.99 1.00 10000\n", + "weighted avg 1.00 1.00 1.00 10000\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#الكود الصح مبدئياً\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + 
"from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import time\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# قراءة ملف CSV باستخدام Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "df # إظهار البيانات\n", + "\n", + "# Preprocess text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', text)\n", + " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Define 
hyperparameters grid for Multinomial Naive Bayes\n", + "param_grid_nb = {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Function to display results in a DataFrame\n", + "def display_results(results):\n", + " df_results = pd.DataFrame(results)\n", + " display(df_results)\n", + "\n", + "# Initialize an empty list to store results\n", + "results = []\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X, y)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", + "\n", + "# Logistic Regression\n", + "print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X, y)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', 
n_jobs=-1)\n", + "grid_search_svm.fit(X, y)\n", + "best_params_svm = grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X, y)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", + "\n", + "# Display results\n", + "display_results(results)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 482 + }, + "id": "rWMIvJ3JJUpT", + "outputId": "0ab4308e-3d57-47af-aaca-534e68fe43a6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + "Tuning hyperparameters for Random Forest:\n", + "Tuning hyperparameters for Logistic Regression:\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: 
ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Support Vector Machine:\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " Model Best Parameters \\\n", + "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", + "1 Logistic Regression {'classifier__C': 10} \n", + "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... \n", + "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", + "\n", + " Best Cross-validation Accuracy \n", + "0 0.6981 \n", + "1 0.7055 \n", + "2 0.7145 \n", + "3 0.6957 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelBest ParametersBest Cross-validation Accuracy
0Random Forest{'classifier__max_depth': None, 'classifier__n...0.6981
1Logistic Regression{'classifier__C': 10}0.7055
2Support Vector Machine{'classifier__C': 10, 'classifier__gamma': 'sc...0.7145
3Multinomial Naive Bayes{'classifier__alpha': 0.1}0.6957
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display_results(results)\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"Model\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Logistic Regression\",\n \"Multinomial Naive Bayes\",\n \"Random Forest\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Parameters\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Cross-validation Accuracy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.008465419855703142,\n \"min\": 0.6957,\n \"max\": 0.7144999999999999,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.7055,\n 0.6957,\n 0.6980999999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "#using model from hugging face\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import time\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "from transformers import pipeline\n", + "\n", + "from google.colab import drive\n", + 
"drive.mount('/content/drive')\n", + "\n", + "# Read the CSV file using Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "print(df.head()) # Display the data\n", + "\n", + "# Preprocess the text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', text)\n", + " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Initialize sentiment analysis pipeline\n", + "sentiment_pipeline = pipeline(\"text-classification\", model=\"j-hartmann/sentiment-roberta-large-english-3-classes\")\n", + "\n", + "# Apply sentiment analysis to the text column\n", + "sentiment_predictions = sentiment_pipeline(df['tweet'].tolist())\n", + "\n", + "# Add the sentiment predictions to the DataFrame\n", + "df['sentiment'] = [sent['label'] for sent in sentiment_predictions]\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Define hyperparameters grid for Multinomial Naive Bayes\n", + "param_grid_nb 
= {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Function to display results in a DataFrame\n", + "def display_results(results):\n", + " df_results = pd.DataFrame(results)\n", + " display(df_results)\n", + "\n", + "# Initialize an empty list to store results\n", + "results = []\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X, y)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", + "\n", + "# Logistic Regression\n", + "print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X, y)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_svm.fit(X, y)\n", + "best_params_svm = 
grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X, y)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", + "\n", + "# Display results\n", + "display_results(results)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 984, + "referenced_widgets": [ + "f7b7943b77ec48709ab2c14ff9c7d8d0", + "8ccb497ccc254b4fbb83a6075d999c98", + "462e849b058049eb90bdec861c4c6733", + "433354ec1982417b8e182d982451cff8", + "79e9f2981ea4421a864c14f0c3c47c88", + "1f7ddf0908aa4a4bbf20c0a98c534802", + "a62aa5c3bacd439b88b66448048054de", + "29f23229640c4a3b978c630bed86b0a0", + "65e51c3b78c14a1db0b8b730534aa5b9", + "108e829acb474ad7b9b5d4576fbfd340", + "7f94536dab254a8ebbce57f65a99511d", + "e2dc2d66fe9d4c068b7400aa68166d82", + "c4fdbaab18b04bd092bc8327dc9384f9", + "76d175d827dc479da41cf4d195176a73", + "02d1a242c3c346f9ba00d4ef54632080", + "3afe49394a7b4e63abbfa6eea620054b", + "5cc5407d5562422b8b4306b9b4540ab8", + "2bbd4694331d42b69a279571576deb51", + "23c88c6f258a402288b6ffe041f61f0b", + "8c75ac06d39f4bf58c4b2cdf513c47a1", + "ac9cf05fb66c4b28beebe6ce874a296b", + "5180b588b5f546a1a5c1b99842ad9bc4", + "536c41525a424101a64e8c3c7fe6a1a7", + "c95bb7e9112f49bbb4b266fe7462586b", + "3d53fbf327a74eccbdeb2bdd7e0e9ea6", + 
"f15764b0fbcc470da3735693598b03d0", + "6b9d84472ca14e869b0eedfd5faa853e", + "e7c76fe4370b4674a6ada68197e5b6e1", + "521b25ffe3e6414d918d9c38dc0f6251", + "49e97c7601b1484cb32de9a14b66a80e", + "a60aa23254dc402397580e8cfa14dde2", + "97016178d3b249398e3bbbd6902a1f45", + "e4203f38f1e34e9baa953248c5d04867", + "55fee200080e4ffa842c7b3ccba6202c", + "681e215549a64a2d86046c20f943accb", + "585daf7d5699401284c3fe9991cfbd1d", + "0754f254add24b1bbac53908ace0674e", + "1c5dd19c1b994e20b6286f921d6e3a08", + "c05c707a897242cebb1844ba012ba823", + "3d05b61f2061429095a1a2d4dc7500dd", + "a8e4bf0b21184ccf9951bdb9114fd10e", + "96934e1a081e480e9608f355cca41566", + "68f47edfbd5d4bacaa3e6424c98f8961", + "ffa81a80e84546148817330dd5c9eefa", + "c9d0683b6fb64df1bc370de4011fe77f", + "d1dc02d934cc417b983dc13d80b22d31", + "cad51788127f440393bea3c2d165f42e", + "2eaee4ea34a647259f015fbd9190e821", + "710c585a886d4d17bd66ebef545b3371", + "f2130c9ea479452699ce1389a74097e6", + "f7db554186144eb0807f71fc142580fa", + "0c61a26cae2d4e2f851be13f51ee78c3", + "ec49ebfb9b16430181b4738b01844692", + "b6f81ce13f774cc1a42ec659d50bfca3", + "ffc2e024df9241de9d7c22269a47fe96", + "c7581989b1d84bc5b453551440671767", + "f841b17df29b42799c740b305f44c3f5", + "fb3820553c8246f8a3f19d11f2929e26", + "9a0be6bf09a943dba3582e78661b64ce", + "97881b93a47e470eb9673ef97a28d741", + "696a8fb7186e4ad9bed904d79cc5da17", + "fb483bee70b04cd1bc2cef99941dfbe8", + "dc610f57bb0d42a4970fd56e2beafd3e", + "77f9434bf9984bbf9d7e0b409f58c236", + "1b7e9249af884ecd89bdfe4da708ac38", + "c3e369bcece54aa4bb6baba02f6c4c30" + ] + }, + "id": "gTDYJRFtQ0Xm", + "outputId": "637b27dc-3121-4534-84a1-1ee31e97a5ab" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + }, + { + 
"metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n", + " tweet label\n", + "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", + "1 I scooped a couple of shares this morning at a... 0\n", + "2 Im streaming ES Futures using Bookmap on youtu... 0\n", + "3 CF taking some off here close to 19150 1\n", + "4 No change to this position is still bullish st... 0\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f7b7943b77ec48709ab2c14ff9c7d8d0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/725 [00:00\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelBest ParametersBest Cross-validation Accuracy
0Random Forest{'classifier__max_depth': None, 'classifier__n...0.6981
1Logistic Regression{'classifier__C': 10}0.7055
2Support Vector Machine{'classifier__C': 10, 'classifier__gamma': 'sc...0.7145
3Multinomial Naive Bayes{'classifier__alpha': 0.1}0.6957
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"display_results(results)\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"Model\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Logistic Regression\",\n \"Multinomial Naive Bayes\",\n \"Random Forest\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Parameters\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Best Cross-validation Accuracy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.008465419855703142,\n \"min\": 0.6957,\n \"max\": 0.7144999999999999,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.7055,\n 0.6957,\n 0.6980999999999999\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "#تعديل على الكود #using model from hugging face\n", + "\n", + "# Libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import time\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "from transformers import pipeline\n", + "\n", + 
"from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# Read CSV file\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "print(df.head()) # Display the data\n", + "\n", + "# Build the stopword set and the lemmatizer once so they are not reloaded for every token\n", + "_STOP_WORDS = set(stopwords.words('english'))\n", + "_LEMMATIZER = WordNetLemmatizer()\n", + "\n", + "# Preprocess text function\n", + "def preprocess_text(text):\n", + " \"\"\"Normalize one tweet: lowercase, strip @mentions, URLs, punctuation and digits, drop English stopwords, lemmatize each token.\"\"\"\n", + " text = text.lower()\n", + " text = re.sub(r'@[^\\s]+', ' ', text)\n", + " text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " # re.escape keeps ']' and the backslash from being read as character-class syntax\n", + " text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " # WordNetLemmatizer.lemmatize() works on a single word, so lemmatize token by token instead of the joined sentence\n", + " tokens = [_LEMMATIZER.lemmatize(w) for w in text.split() if w not in _STOP_WORDS]\n", + " return \" \".join(tokens)\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target (avoiding data leakage)\n", + "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Function to display results in a DataFrame\n", + "def display_results(results):\n", + " df_results = pd.DataFrame(results)\n", + " print(df_results)\n", + "\n", + "# Initialize an empty list to store results\n", + "results = []\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Define hyperparameters grid for Multinomial Naive Bayes\n",
+ "param_grid_nb = {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X_train, y_train)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", + "\n", + "# Logistic Regression\n", + "print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X_train, y_train)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_svm.fit(X_train, y_train)\n", + "best_params_svm = grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for 
Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer()),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X_train, y_train)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", + "\n", + "# Display results\n", + "display_results(results)\n", + "\n", + "# Evaluate on test data (including additional metrics)\n", + "print(\"Performance on Test Set:\")\n", + "for model_name, model in [('Random Forest', grid_search_rf), ('Logistic Regression', grid_search_lr), ('Support Vector Machine', grid_search_svm), ('Multinomial Naive Bayes', grid_search_nb)]:\n", + " y_pred = model.predict(X_test)\n", + " print(f\"Model: {model_name}\")\n", + " print(classification_report(y_test, y_pred))\n", + " print(confusion_matrix(y_test, y_pred))\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "D1PMZ1g5ts_Z", + "outputId": "5044b9c4-7106-4ffa-f455-1d383e96bea8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + " tweet label\n", + "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", + "1 I scooped a couple of shares this morning at a... 
0\n", + "2 Im streaming ES Futures using Bookmap on youtu... 0\n", + "3 CF taking some off here close to 19150 1\n", + "4 No change to this position is still bullish st... 0\n", + "Tuning hyperparameters for Random Forest:\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", + " pid = os.fork()\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Logistic Regression:\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Support Vector Machine:\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n", + " Model Best Parameters \\\n", + "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", + "1 Logistic Regression {'classifier__C': 10} \n", + "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... 
\n", + "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", + "\n", + " Best Cross-validation Accuracy \n", + "0 0.691000 \n", + "1 0.698375 \n", + "2 0.707000 \n", + "3 0.689750 \n", + "Performance on Test Set:\n", + "Model: Random Forest\n", + " precision recall f1-score support\n", + "\n", + " -1 0.70 0.51 0.59 512\n", + " 0 0.68 0.86 0.76 985\n", + " 1 0.78 0.58 0.67 503\n", + "\n", + " accuracy 0.70 2000\n", + " macro avg 0.72 0.65 0.67 2000\n", + "weighted avg 0.71 0.70 0.69 2000\n", + "\n", + "[[260 218 34]\n", + " [ 90 847 48]\n", + " [ 22 188 293]]\n", + "Model: Logistic Regression\n", + " precision recall f1-score support\n", + "\n", + " -1 0.68 0.57 0.62 512\n", + " 0 0.70 0.79 0.74 985\n", + " 1 0.69 0.62 0.66 503\n", + "\n", + " accuracy 0.69 2000\n", + " macro avg 0.69 0.66 0.67 2000\n", + "weighted avg 0.69 0.69 0.69 2000\n", + "\n", + "[[294 174 44]\n", + " [113 777 95]\n", + " [ 25 164 314]]\n", + "Model: Support Vector Machine\n", + " precision recall f1-score support\n", + "\n", + " -1 0.70 0.56 0.62 512\n", + " 0 0.68 0.84 0.75 985\n", + " 1 0.77 0.57 0.66 503\n", + "\n", + " accuracy 0.70 2000\n", + " macro avg 0.72 0.66 0.68 2000\n", + "weighted avg 0.71 0.70 0.69 2000\n", + "\n", + "[[285 200 27]\n", + " [ 99 827 59]\n", + " [ 22 194 287]]\n", + "Model: Multinomial Naive Bayes\n", + " precision recall f1-score support\n", + "\n", + " -1 0.67 0.59 0.63 512\n", + " 0 0.68 0.80 0.74 985\n", + " 1 0.71 0.55 0.62 503\n", + "\n", + " accuracy 0.69 2000\n", + " macro avg 0.69 0.65 0.66 2000\n", + "weighted avg 0.69 0.69 0.68 2000\n", + "\n", + "[[300 175 37]\n", + " [117 792 76]\n", + " [ 28 196 279]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + 
"from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# قراءة ملف CSV باستخدام Pandas\n", + "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "\n", + "# Preprocess text column\n", + "def preprocess_text(text):\n", + " text = text.lower()\n", + " text = re.sub('@[^\\s]+', ' ', text)\n", + " text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n", + " text = re.sub(f'[{string.punctuation}]', ' ', text)\n", + " text = re.sub('[0-9]+', '', text)\n", + " text = \" \".join(str(text).split())\n", + " text = [w for w in text.split() if w not in stopwords.words('english')]\n", + " text = \" \".join(text)\n", + " text = WordNetLemmatizer().lemmatize(text)\n", + " return text\n", + "\n", + "df['tweet'] = df['tweet'].apply(preprocess_text)\n", + "\n", + "# Split data into features and target\n", + "X = df['tweet']\n", + "y = df['label']\n", + "\n", + "# Initialize RandomForestClassifier\n", + "rf = RandomForestClassifier()\n", + "\n", + "# Define hyperparameters grid\n", + "param_grid = {\n", + " 'rf__n_estimators': [50, 100, 200],\n", + " 'rf__max_depth': [None, 10, 20],\n", + "}\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Define pipeline\n", + "pipeline = Pipeline([\n", + " ('vectorizer', TfidfVectorizer()),\n", + " ('rf', rf)\n", + "])\n", + "\n", + "# Perform hyperparameter tuning\n", + "grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search.fit(X, y)\n", + "\n", + "# Get best model and parameters\n", + "best_model = grid_search.best_estimator_\n", + "best_params = 
grid_search.best_params_\n", + "best_score = grid_search.best_score_\n", + "\n", + "print(\"Best parameters:\", best_params)\n", + "print(\"Best cross-validation accuracy:\", best_score)\n", + "\n", + "# Fit the best model\n", + "best_model.fit(X, y)\n", + "\n", + "# Predict using the best model\n", + "y_pred = best_model.predict(X)\n", + "\n", + "# Calculate accuracy\n", + "accuracy = accuracy_score(y, y_pred)\n", + "print(\"Final Accuracy:\", accuracy)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UhM3zb7S-XI6", + "outputId": "1934f8c1-a924-40d9-e840-f3c714e54e83" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + "Best parameters: {'rf__max_depth': None, 'rf__n_estimators': 200}\n", + "Best cross-validation accuracy: 0.6987\n", + "Final Accuracy: 0.9955\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#بدون معالجة\n", + "# Libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import time\n", + "from imblearn.over_sampling import RandomOverSampler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "nltk.download('stopwords')\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "# Read CSV file\n", + "df = 
pd.read_csv('/content/drive/MyDrive/train (1).csv')\n", + "print(df.head()) # Display the data\n", + "\n", + "# Split data into features and target (avoiding data leakage)\n", + "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n", + "\n", + "# Initialize KFold cross-validator\n", + "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", + "\n", + "# Function to display results in a DataFrame\n", + "def display_results(results):\n", + " df_results = pd.DataFrame(results)\n", + " print(df_results)\n", + "\n", + "# Initialize an empty list to store results\n", + "results = []\n", + "\n", + "# Define hyperparameters grid for Random Forest\n", + "param_grid_rf = {\n", + " 'classifier__n_estimators': [50, 100, 200],\n", + " 'classifier__max_depth': [None, 10, 20]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Logistic Regression\n", + "param_grid_lr = {\n", + " 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n", + "}\n", + "\n", + "# Define hyperparameters grid for Support Vector Machine\n", + "param_grid_svm = {\n", + " 'classifier__C': [0.1, 1, 10],\n", + " 'classifier__gamma': ['scale', 'auto']\n", + "}\n", + "\n", + "# Define hyperparameters grid for Multinomial Naive Bayes\n", + "param_grid_nb = {\n", + " 'classifier__alpha': [0.1, 0.5, 1.0]\n", + "}\n", + "\n", + "# Random Forest\n", + "print(\"Tuning hyperparameters for Random Forest:\")\n", + "pipeline_rf = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", + " ('classifier', RandomForestClassifier())\n", + "])\n", + "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_rf.fit(X_train, y_train)\n", + "best_params_rf = grid_search_rf.best_params_\n", + "best_score_rf = grid_search_rf.best_score_\n", + "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n", + "\n", + 
"# Logistic Regression\n", + "print(\"Tuning hyperparameters for Logistic Regression:\")\n", + "pipeline_lr = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", + " ('classifier', LogisticRegression())\n", + "])\n", + "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_lr.fit(X_train, y_train)\n", + "best_params_lr = grid_search_lr.best_params_\n", + "best_score_lr = grid_search_lr.best_score_\n", + "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n", + "\n", + "# Support Vector Machine\n", + "print(\"Tuning hyperparameters for Support Vector Machine:\")\n", + "pipeline_svm = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", + " ('classifier', SVC())\n", + "])\n", + "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_svm.fit(X_train, y_train)\n", + "best_params_svm = grid_search_svm.best_params_\n", + "best_score_svm = grid_search_svm.best_score_\n", + "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n", + "\n", + "# Multinomial Naive Bayes\n", + "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n", + "pipeline_nb = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n", + " ('classifier', MultinomialNB())\n", + "])\n", + "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n", + "grid_search_nb.fit(X_train, y_train)\n", + "best_params_nb = grid_search_nb.best_params_\n", + "best_score_nb = grid_search_nb.best_score_\n", + "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n", + "\n", + "# Display results\n", + 
"display_results(results)\n", + "\n", + "# Evaluate on test data (including additional metrics)\n", + "print(\"Performance on Test Set:\")\n", + "for model_name, model in [('Random Forest', grid_search_rf), ('Logistic Regression', grid_search_lr), ('Support Vector Machine', grid_search_svm), ('Multinomial Naive Bayes', grid_search_nb)]:\n", + " y_pred = model.predict(X_test)\n", + " print(f\"Model: {model_name}\")\n", + " print(classification_report(y_test, y_pred))\n", + " print(confusion_matrix(y_test, y_pred))\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ABQvR4RLKbwq", + "outputId": "3a5fdc25-85d3-43c1-b69d-6ae95fdce6bb" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n", + " tweet label\n", + "0 BofA previews Netflixs NFLX Q3 Earnings Tues 0... 0\n", + "1 I scooped a couple of shares this morning at a... 0\n", + "2 Im streaming ES Futures using Bookmap on youtu... 0\n", + "3 CF taking some off here close to 19150 1\n", + "4 No change to this position is still bullish st... 0\n", + "Tuning hyperparameters for Random Forest:\n", + "Tuning hyperparameters for Logistic Regression:\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tuning hyperparameters for Support Vector Machine:\n", + "Tuning hyperparameters for Multinomial Naive Bayes:\n", + " Model Best Parameters \\\n", + "0 Random Forest {'classifier__max_depth': None, 'classifier__n... \n", + "1 Logistic Regression {'classifier__C': 10} \n", + "2 Support Vector Machine {'classifier__C': 10, 'classifier__gamma': 'sc... \n", + "3 Multinomial Naive Bayes {'classifier__alpha': 0.1} \n", + "\n", + " Best Cross-validation Accuracy \n", + "0 0.686875 \n", + "1 0.700625 \n", + "2 0.705000 \n", + "3 0.695625 \n", + "Performance on Test Set:\n", + "Model: Random Forest\n", + " precision recall f1-score support\n", + "\n", + " -1 0.71 0.51 0.59 512\n", + " 0 0.68 0.87 0.76 985\n", + " 1 0.76 0.56 0.64 503\n", + "\n", + " accuracy 0.70 2000\n", + " macro avg 0.72 0.65 0.67 2000\n", + "weighted avg 0.71 0.70 0.69 2000\n", + "\n", + "[[259 209 44]\n", + " [ 81 860 44]\n", + " [ 23 200 280]]\n", + "Model: Logistic Regression\n", + " precision recall f1-score support\n", + "\n", + " -1 0.68 0.59 0.63 512\n", + " 0 0.70 0.79 0.74 985\n", + " 1 0.70 0.63 0.66 503\n", + "\n", + " accuracy 0.70 2000\n", + " macro avg 0.69 0.67 0.68 2000\n", + "weighted avg 0.70 0.70 0.69 2000\n", + "\n", + "[[303 170 39]\n", + " [115 774 96]\n", + " [ 29 159 315]]\n", + "Model: Support Vector Machine\n", + " precision recall f1-score support\n", + "\n", + " -1 0.70 0.55 0.61 512\n", + " 0 0.68 0.84 0.75 985\n", + " 1 0.78 0.58 0.66 503\n", + "\n", + " accuracy 0.70 2000\n", + " macro avg 0.72 0.65 0.67 
2000\n", + "weighted avg 0.71 0.70 0.69 2000\n", + "\n", + "[[281 206 25]\n", + " [100 826 59]\n", + " [ 23 190 290]]\n", + "Model: Multinomial Naive Bayes\n", + " precision recall f1-score support\n", + "\n", + " -1 0.66 0.61 0.63 512\n", + " 0 0.69 0.79 0.73 985\n", + " 1 0.69 0.55 0.62 503\n", + "\n", + " accuracy 0.68 2000\n", + " macro avg 0.68 0.65 0.66 2000\n", + "weighted avg 0.68 0.68 0.68 2000\n", + "\n", + "[[311 159 42]\n", + " [128 776 81]\n", + " [ 32 192 279]]\n" + ] + } + ] + } + ] +} \ No newline at end of file