{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "f7b7943b77ec48709ab2c14ff9c7d8d0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_8ccb497ccc254b4fbb83a6075d999c98",
              "IPY_MODEL_462e849b058049eb90bdec861c4c6733",
              "IPY_MODEL_433354ec1982417b8e182d982451cff8"
            ],
            "layout": "IPY_MODEL_79e9f2981ea4421a864c14f0c3c47c88"
          }
        },
        "8ccb497ccc254b4fbb83a6075d999c98": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_1f7ddf0908aa4a4bbf20c0a98c534802",
            "placeholder": "​",
            "style": "IPY_MODEL_a62aa5c3bacd439b88b66448048054de",
            "value": "config.json: 100%"
          }
        },
        "462e849b058049eb90bdec861c4c6733": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_29f23229640c4a3b978c630bed86b0a0",
            "max": 725,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_65e51c3b78c14a1db0b8b730534aa5b9",
            "value": 725
          }
        },
        "433354ec1982417b8e182d982451cff8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_108e829acb474ad7b9b5d4576fbfd340",
            "placeholder": "​",
            "style": "IPY_MODEL_7f94536dab254a8ebbce57f65a99511d",
            "value": " 725/725 [00:00&lt;00:00, 30.0kB/s]"
          }
        },
        "79e9f2981ea4421a864c14f0c3c47c88": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "1f7ddf0908aa4a4bbf20c0a98c534802": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a62aa5c3bacd439b88b66448048054de": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "29f23229640c4a3b978c630bed86b0a0": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "65e51c3b78c14a1db0b8b730534aa5b9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "108e829acb474ad7b9b5d4576fbfd340": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "7f94536dab254a8ebbce57f65a99511d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "e2dc2d66fe9d4c068b7400aa68166d82": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_c4fdbaab18b04bd092bc8327dc9384f9",
              "IPY_MODEL_76d175d827dc479da41cf4d195176a73",
              "IPY_MODEL_02d1a242c3c346f9ba00d4ef54632080"
            ],
            "layout": "IPY_MODEL_3afe49394a7b4e63abbfa6eea620054b"
          }
        },
        "c4fdbaab18b04bd092bc8327dc9384f9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_5cc5407d5562422b8b4306b9b4540ab8",
            "placeholder": "​",
            "style": "IPY_MODEL_2bbd4694331d42b69a279571576deb51",
            "value": "pytorch_model.bin: 100%"
          }
        },
        "76d175d827dc479da41cf4d195176a73": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_23c88c6f258a402288b6ffe041f61f0b",
            "max": 1425820242,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_8c75ac06d39f4bf58c4b2cdf513c47a1",
            "value": 1425820242
          }
        },
        "02d1a242c3c346f9ba00d4ef54632080": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ac9cf05fb66c4b28beebe6ce874a296b",
            "placeholder": "​",
            "style": "IPY_MODEL_5180b588b5f546a1a5c1b99842ad9bc4",
            "value": " 1.43G/1.43G [00:13&lt;00:00, 119MB/s]"
          }
        },
        "3afe49394a7b4e63abbfa6eea620054b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5cc5407d5562422b8b4306b9b4540ab8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "2bbd4694331d42b69a279571576deb51": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "23c88c6f258a402288b6ffe041f61f0b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "8c75ac06d39f4bf58c4b2cdf513c47a1": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "ac9cf05fb66c4b28beebe6ce874a296b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5180b588b5f546a1a5c1b99842ad9bc4": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "536c41525a424101a64e8c3c7fe6a1a7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_c95bb7e9112f49bbb4b266fe7462586b",
              "IPY_MODEL_3d53fbf327a74eccbdeb2bdd7e0e9ea6",
              "IPY_MODEL_f15764b0fbcc470da3735693598b03d0"
            ],
            "layout": "IPY_MODEL_6b9d84472ca14e869b0eedfd5faa853e"
          }
        },
        "c95bb7e9112f49bbb4b266fe7462586b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_e7c76fe4370b4674a6ada68197e5b6e1",
            "placeholder": "​",
            "style": "IPY_MODEL_521b25ffe3e6414d918d9c38dc0f6251",
            "value": "tokenizer_config.json: 100%"
          }
        },
        "3d53fbf327a74eccbdeb2bdd7e0e9ea6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_49e97c7601b1484cb32de9a14b66a80e",
            "max": 1070,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_a60aa23254dc402397580e8cfa14dde2",
            "value": 1070
          }
        },
        "f15764b0fbcc470da3735693598b03d0": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_97016178d3b249398e3bbbd6902a1f45",
            "placeholder": "​",
            "style": "IPY_MODEL_e4203f38f1e34e9baa953248c5d04867",
            "value": " 1.07k/1.07k [00:00&lt;00:00, 52.3kB/s]"
          }
        },
        "6b9d84472ca14e869b0eedfd5faa853e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "e7c76fe4370b4674a6ada68197e5b6e1": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "521b25ffe3e6414d918d9c38dc0f6251": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "49e97c7601b1484cb32de9a14b66a80e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a60aa23254dc402397580e8cfa14dde2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "97016178d3b249398e3bbbd6902a1f45": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "e4203f38f1e34e9baa953248c5d04867": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "55fee200080e4ffa842c7b3ccba6202c": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_681e215549a64a2d86046c20f943accb",
              "IPY_MODEL_585daf7d5699401284c3fe9991cfbd1d",
              "IPY_MODEL_0754f254add24b1bbac53908ace0674e"
            ],
            "layout": "IPY_MODEL_1c5dd19c1b994e20b6286f921d6e3a08"
          }
        },
        "681e215549a64a2d86046c20f943accb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_c05c707a897242cebb1844ba012ba823",
            "placeholder": "​",
            "style": "IPY_MODEL_3d05b61f2061429095a1a2d4dc7500dd",
            "value": "vocab.json: 100%"
          }
        },
        "585daf7d5699401284c3fe9991cfbd1d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_a8e4bf0b21184ccf9951bdb9114fd10e",
            "max": 898822,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_96934e1a081e480e9608f355cca41566",
            "value": 898822
          }
        },
        "0754f254add24b1bbac53908ace0674e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_68f47edfbd5d4bacaa3e6424c98f8961",
            "placeholder": "​",
            "style": "IPY_MODEL_ffa81a80e84546148817330dd5c9eefa",
            "value": " 899k/899k [00:00&lt;00:00, 3.38MB/s]"
          }
        },
        "1c5dd19c1b994e20b6286f921d6e3a08": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "c05c707a897242cebb1844ba012ba823": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "3d05b61f2061429095a1a2d4dc7500dd": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "a8e4bf0b21184ccf9951bdb9114fd10e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "96934e1a081e480e9608f355cca41566": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "68f47edfbd5d4bacaa3e6424c98f8961": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ffa81a80e84546148817330dd5c9eefa": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "c9d0683b6fb64df1bc370de4011fe77f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_d1dc02d934cc417b983dc13d80b22d31",
              "IPY_MODEL_cad51788127f440393bea3c2d165f42e",
              "IPY_MODEL_2eaee4ea34a647259f015fbd9190e821"
            ],
            "layout": "IPY_MODEL_710c585a886d4d17bd66ebef545b3371"
          }
        },
        "d1dc02d934cc417b983dc13d80b22d31": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_f2130c9ea479452699ce1389a74097e6",
            "placeholder": "​",
            "style": "IPY_MODEL_f7db554186144eb0807f71fc142580fa",
            "value": "merges.txt: 100%"
          }
        },
        "cad51788127f440393bea3c2d165f42e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_0c61a26cae2d4e2f851be13f51ee78c3",
            "max": 456318,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_ec49ebfb9b16430181b4738b01844692",
            "value": 456318
          }
        },
        "2eaee4ea34a647259f015fbd9190e821": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_b6f81ce13f774cc1a42ec659d50bfca3",
            "placeholder": "​",
            "style": "IPY_MODEL_ffc2e024df9241de9d7c22269a47fe96",
            "value": " 456k/456k [00:00&lt;00:00, 3.42MB/s]"
          }
        },
        "710c585a886d4d17bd66ebef545b3371": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "f2130c9ea479452699ce1389a74097e6": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "f7db554186144eb0807f71fc142580fa": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "0c61a26cae2d4e2f851be13f51ee78c3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ec49ebfb9b16430181b4738b01844692": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "b6f81ce13f774cc1a42ec659d50bfca3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ffc2e024df9241de9d7c22269a47fe96": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "c7581989b1d84bc5b453551440671767": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_f841b17df29b42799c740b305f44c3f5",
              "IPY_MODEL_fb3820553c8246f8a3f19d11f2929e26",
              "IPY_MODEL_9a0be6bf09a943dba3582e78661b64ce"
            ],
            "layout": "IPY_MODEL_97881b93a47e470eb9673ef97a28d741"
          }
        },
        "f841b17df29b42799c740b305f44c3f5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_696a8fb7186e4ad9bed904d79cc5da17",
            "placeholder": "​",
            "style": "IPY_MODEL_fb483bee70b04cd1bc2cef99941dfbe8",
            "value": "special_tokens_map.json: 100%"
          }
        },
        "fb3820553c8246f8a3f19d11f2929e26": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_dc610f57bb0d42a4970fd56e2beafd3e",
            "max": 772,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_77f9434bf9984bbf9d7e0b409f58c236",
            "value": 772
          }
        },
        "9a0be6bf09a943dba3582e78661b64ce": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_1b7e9249af884ecd89bdfe4da708ac38",
            "placeholder": "​",
            "style": "IPY_MODEL_c3e369bcece54aa4bb6baba02f6c4c30",
            "value": " 772/772 [00:00&lt;00:00, 35.6kB/s]"
          }
        },
        "97881b93a47e470eb9673ef97a28d741": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "696a8fb7186e4ad9bed904d79cc5da17": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "fb483bee70b04cd1bc2cef99941dfbe8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "dc610f57bb0d42a4970fd56e2beafd3e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "77f9434bf9984bbf9d7e0b409f58c236": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "1b7e9249af884ecd89bdfe4da708ac38": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "c3e369bcece54aa4bb6baba02f6c4c30": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import accuracy_score\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# قراءة ملف CSV باستخدام Pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "\n",
        "# Preprocess text data\n",
        "df['tweet'] = df['tweet'].str.lower()\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('@[^\\s]+', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{string.punctuation}]', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(str(x).split()))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: [w for w in x.split() if w not in stopwords.words('english')])\n",
        "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: WordNetLemmatizer().lemmatize(x))\n",
        "\n",
        "# Fit the bag of words transformer to the text column\n",
        "bow_transformer = CountVectorizer().fit(df['tweet'])\n",
        "\n",
        "# Transform the text column to bag of words representation\n",
        "text_bow = bow_transformer.transform(df['tweet'])\n",
        "\n",
        "# Apply Tf-Idf transformer to the bag of words representation\n",
        "tfidf_transformer = TfidfTransformer().fit(text_bow)\n",
        "text_tfidf = tfidf_transformer.transform(text_bow)\n",
        "\n",
        "# Split the data into train and test sets\n",
        "X_train, X_test, y_train, y_test = train_test_split(text_tfidf, df['label'], test_size=0.2, random_state=42)\n",
        "\n",
        "# Initialize and train the model\n",
        "model = LogisticRegression()\n",
        "model.fit(X_train, y_train)\n",
        "\n",
        "# Predict using the test set\n",
        "y_pred = model.predict(X_test)\n",
        "\n",
        "# Calculate accuracy\n",
        "accuracy = accuracy_score(y_test, y_pred)\n",
        "print(\"Accuracy:\", accuracy)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ELI_EP93ws0o",
        "outputId": "0bae90bc-d511-4888-c507-cc5908f38a82"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Accuracy: 0.677\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
        "from sklearn.model_selection import train_test_split, KFold\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import accuracy_score\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from xgboost import XGBClassifier\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# قراءة ملف CSV باستخدام Pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "\n",
        "# Preprocess text data\n",
        "df['tweet'] = df['tweet'].str.lower()\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('@[^\\s]+', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub(f'[{string.punctuation}]', ' ', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: re.sub('[0-9]+', '', x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(str(x).split()))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: [w for w in x.split() if w not in stopwords.words('english')])\n",
        "df['tweet'] = df['tweet'].apply(lambda x: \" \".join(x))\n",
        "df['tweet'] = df['tweet'].apply(lambda x: WordNetLemmatizer().lemmatize(x))\n",
        "\n",
        "# Split the data into features and target\n",
        "X = df['tweet']\n",
        "y = df['label']\n",
        "\n",
        "# Initialize models\n",
        "models = {\n",
        "    'Logistic Regression': LogisticRegression(),\n",
        "    'Random Forest': RandomForestClassifier(),\n",
        "    'Support Vector Machine': SVC(),\n",
        "    'Multinomial Naive Bayes': MultinomialNB(),\n",
        "    'XGBoost': XGBClassifier()\n",
        "}\n",
        "\n",
        "# Apply K-Fold cross-validation\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "for model_name, model in models.items():\n",
        "    print(f\"Training {model_name}:\")\n",
        "    accuracies = []\n",
        "    for train_index, test_index in kf.split(X):\n",
        "        X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n",
        "        y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n",
        "\n",
        "        # Fit the bag of words transformer to the text column\n",
        "        bow_transformer = CountVectorizer().fit(X_train)\n",
        "        # Transform the text column to bag of words representation\n",
        "        text_bow_train = bow_transformer.transform(X_train)\n",
        "        text_bow_test = bow_transformer.transform(X_test)\n",
        "\n",
        "        # Apply Tf-Idf transformer to the bag of words representation\n",
        "        tfidf_transformer = TfidfTransformer().fit(text_bow_train)\n",
        "        text_tfidf_train = tfidf_transformer.transform(text_bow_train)\n",
        "        text_tfidf_test = tfidf_transformer.transform(text_bow_test)\n",
        "\n",
        "        # Train the model\n",
        "        model.fit(text_tfidf_train, y_train)\n",
        "\n",
        "        # Predict using the test set\n",
        "        y_pred = model.predict(text_tfidf_test)\n",
        "\n",
        "        # Calculate accuracy\n",
        "        accuracy = accuracy_score(y_test, y_pred)\n",
        "        accuracies.append(accuracy)\n",
        "        print(f\"  - Fold accuracy: {accuracy}\")\n",
        "\n",
        "    # Average accuracy across all folds\n",
        "    avg_accuracy = np.mean(accuracies)\n",
        "    print(f\"{model_name} average accuracy: {avg_accuracy}\\n\")\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 369
        },
        "id": "mo68I9qZ2zqB",
        "outputId": "3bff4613-0bfd-4d46-dd0c-3b70b976488c"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "error",
          "ename": "FileNotFoundError",
          "evalue": "[Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-5-b5172d2f9851>\u001b[0m in \u001b[0;36m<cell line: 18>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[0;31m# Read the CSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/content/drive/MyDrive/train (1).csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[0;31m# Preprocess text data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m    910\u001b[0m     \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    911\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 912\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    913\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    914\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    575\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    576\u001b[0m     \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 577\u001b[0;31m     \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    578\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    579\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1405\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1406\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1407\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1409\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1659\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1660\u001b[0m                     \u001b[0mmode\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1661\u001b[0;31m             self.handles = get_handle(\n\u001b[0m\u001b[1;32m   1662\u001b[0m                 \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1663\u001b[0m                 \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    857\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    858\u001b[0m             \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m             handle = open(\n\u001b[0m\u001b[1;32m    860\u001b[0m                 \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    861\u001b[0m                 \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/drive/MyDrive/train (1).csv'"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# قراءة ملف CSV باستخدام Pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "\n",
        "# Preprocess text column\n",
        "def preprocess_text(text):\n",
        "    text = text.lower()\n",
        "    text = re.sub('@[^\\s]+', ' ', text)\n",
        "    text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    text = re.sub(f'[{string.punctuation}]', ' ', text)\n",
        "    text = re.sub('[0-9]+', '', text)\n",
        "    text = \" \".join(str(text).split())\n",
        "    text = [w for w in text.split() if w not in stopwords.words('english')]\n",
        "    text = \" \".join(text)\n",
        "    text = WordNetLemmatizer().lemmatize(text)\n",
        "    return text\n",
        "\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Split data into features and target\n",
        "X = df['tweet']\n",
        "y = df['label']\n",
        "\n",
        "# Initialize models\n",
        "models = {\n",
        "    'Random Forest': RandomForestClassifier(),\n",
        "    'Logistic Regression': LogisticRegression(),\n",
        "    'Support Vector Machine': SVC(),\n",
        "    'Multinomial Naive Bayes': MultinomialNB()\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for each model\n",
        "param_grid = {\n",
        "    'Random Forest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]},\n",
        "    'Logistic Regression': {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},\n",
        "    'Support Vector Machine': {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']},\n",
        "    'Multinomial Naive Bayes': {'classifier__alpha': [0.1, 0.5, 1.0]},\n",
        "}\n",
        "\n",
        "# Initialize KFold cross-validator\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "# Perform hyperparameter tuning for each model\n",
        "for model_name, model in models.items():\n",
        "    print(f\"Tuning hyperparameters for {model_name}:\")\n",
        "\n",
        "    # Define pipeline with TfidfVectorizer and model\n",
        "    pipeline = Pipeline([\n",
        "        ('tfidf', TfidfVectorizer()),\n",
        "        ('classifier', model)\n",
        "    ])\n",
        "\n",
        "    # Perform grid search\n",
        "    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "    grid_search.fit(X, y)\n",
        "\n",
        "    # Print best parameters and best cross-validation accuracy\n",
        "    best_params = grid_search.best_params_\n",
        "    best_score = grid_search.best_score_\n",
        "    print(f\"Best parameters: {best_params}\")\n",
        "    print(f\"Best cross-validation accuracy: {best_score}\\n\")\n",
        "\n",
        "# Choose the best model\n",
        "best_model_name = max(models, key=lambda k: grid_search.cv_results_['mean_test_score'][np.argwhere(grid_search.cv_results_['rank_test_score'] == 1)[0][0]])\n",
        "best_model = grid_search.best_estimator_\n",
        "\n",
        "# Fit the best model\n",
        "best_model.fit(X, y)\n",
        "\n",
        "# Predict using the best model\n",
        "y_pred = best_model.predict(X)\n",
        "\n",
        "# Calculate accuracy\n",
        "accuracy = accuracy_score(y, y_pred)\n",
        "print(\"Final Accuracy:\", accuracy)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YxlqbMM74t1m",
        "outputId": "ecde4844-a8bd-42f9-ff3b-b88599c5c0cc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Random Forest:\n",
            "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}\n",
            "Best cross-validation accuracy: 0.6992999999999999\n",
            "\n",
            "Tuning hyperparameters for Logistic Regression:\n",
            "Best parameters: {'classifier__C': 10}\n",
            "Best cross-validation accuracy: 0.7055\n",
            "\n",
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n",
            "Best cross-validation accuracy: 0.7144999999999999\n",
            "\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n",
            "Best parameters: {'classifier__alpha': 0.1}\n",
            "Best cross-validation accuracy: 0.6957\n",
            "\n",
            "Final Accuracy: 0.9004\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#فصل اعدادات كل موديل\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read the CSV file with pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "\n",
        "# Preprocess text column\n",
        "# Built once at definition time: reloading the stop-word list for every token is very slow.\n",
        "_STOP_WORDS = frozenset(stopwords.words('english'))\n",
        "_LEMMATIZER = WordNetLemmatizer()\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize a raw tweet into a space-separated string of lemmatized tokens.\n",
        "\n",
        "    The text is lowercased, @mentions and URLs are dropped, punctuation is\n",
        "    replaced by spaces, digits are removed, English stop words are filtered\n",
        "    out and each remaining token is lemmatized.\n",
        "\n",
        "    Args:\n",
        "        text: Raw tweet text as a str.\n",
        "\n",
        "    Returns:\n",
        "        The cleaned tweet as one string (empty if no token survives).\n",
        "    \"\"\"\n",
        "    text = text.lower()\n",
        "    # Raw strings keep the regex escapes from being parsed as string escapes.\n",
        "    text = re.sub(r'@[^\\s]+', ' ', text)\n",
        "    text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    # Escape the punctuation so every character is literal inside the class.\n",
        "    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)\n",
        "    text = re.sub(r'[0-9]+', '', text)\n",
        "    # lemmatize() expects a single word, so it is applied token by token.\n",
        "    tokens = [_LEMMATIZER.lemmatize(w) for w in text.split() if w not in _STOP_WORDS]\n",
        "    return \" \".join(tokens)\n",
        "\n",
        "# Clean every tweet before it reaches the vectorizer.\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Features are the cleaned tweets; the target is the label column.\n",
        "X, y = df['tweet'], df['label']\n",
        "\n",
        "# Search spaces per model; the 'classifier__' prefix routes each value\n",
        "# to the pipeline step named 'classifier'.\n",
        "param_grid_rf = {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]}\n",
        "param_grid_lr = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}\n",
        "param_grid_svm = {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']}\n",
        "param_grid_nb = {'classifier__alpha': [0.1, 0.5, 1.0]}\n",
        "\n",
        "# Shuffled 5-fold splitter with a fixed seed, shared by every grid search.\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "def tune_model(name, classifier, param_grid, X_fit, y_fit, cv, report=True):\n",
        "    \"\"\"Grid-search a TF-IDF + classifier pipeline with accuracy scoring.\n",
        "\n",
        "    Args:\n",
        "        name: Display name used in the progress messages.\n",
        "        classifier: Unfitted estimator used as the 'classifier' pipeline step.\n",
        "        param_grid: Grid of values to try, keyed by 'classifier__<param>'.\n",
        "        X_fit: Texts used for cross-validation and the final refit.\n",
        "        y_fit: Labels aligned with X_fit.\n",
        "        cv: Cross-validation splitter.\n",
        "        report: When True, print the best parameters and score.\n",
        "\n",
        "    Returns:\n",
        "        The fitted GridSearchCV instance.\n",
        "    \"\"\"\n",
        "    print(f\"Tuning hyperparameters for {name}:\")\n",
        "    pipeline = Pipeline([\n",
        "        ('tfidf', TfidfVectorizer()),\n",
        "        ('classifier', classifier)\n",
        "    ])\n",
        "    search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)\n",
        "    search.fit(X_fit, y_fit)\n",
        "    if report:\n",
        "        print(f\"Best parameters: {search.best_params_}\")\n",
        "        print(f\"Best cross-validation accuracy: {search.best_score_}\\n\")\n",
        "    return search\n",
        "\n",
        "# Hold out a stratified test split so the final score is measured on tweets\n",
        "# that neither the grid search nor the refit has seen.\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X, y, test_size=0.2, stratify=y, random_state=42\n",
        ")\n",
        "\n",
        "# random_state makes the forest repeatable between runs; max_iter=1000 lets\n",
        "# lbfgs converge instead of stopping at the default iteration limit.\n",
        "grid_search_rf = tune_model('Random Forest', RandomForestClassifier(random_state=42), param_grid_rf, X_train, y_train, kf)\n",
        "grid_search_lr = tune_model('Logistic Regression', LogisticRegression(max_iter=1000), param_grid_lr, X_train, y_train, kf)\n",
        "grid_search_svm = tune_model('Support Vector Machine', SVC(), param_grid_svm, X_train, y_train, kf)\n",
        "grid_search_nb = tune_model('Multinomial Naive Bayes', MultinomialNB(), param_grid_nb, X_train, y_train, kf)\n",
        "\n",
        "# Keep the per-model names that this cell has always exposed.\n",
        "pipeline_rf, best_params_rf, best_score_rf = grid_search_rf.estimator, grid_search_rf.best_params_, grid_search_rf.best_score_\n",
        "pipeline_lr, best_params_lr, best_score_lr = grid_search_lr.estimator, grid_search_lr.best_params_, grid_search_lr.best_score_\n",
        "pipeline_svm, best_params_svm, best_score_svm = grid_search_svm.estimator, grid_search_svm.best_params_, grid_search_svm.best_score_\n",
        "pipeline_nb, best_params_nb, best_score_nb = grid_search_nb.estimator, grid_search_nb.best_params_, grid_search_nb.best_score_\n",
        "\n",
        "# Choose the best model by cross-validation accuracy (the first entry wins ties).\n",
        "searches = {\n",
        "    'Random Forest': grid_search_rf,\n",
        "    'Logistic Regression': grid_search_lr,\n",
        "    'Support Vector Machine': grid_search_svm,\n",
        "    'Multinomial Naive Bayes': grid_search_nb\n",
        "}\n",
        "best_model_name = max(searches, key=lambda model_name: searches[model_name].best_score_)\n",
        "# GridSearchCV refits the winning pipeline on all of X_train by default,\n",
        "# so best_estimator_ is ready to use without another fit.\n",
        "best_model = searches[best_model_name].best_estimator_\n",
        "\n",
        "# Score on the held-out split only; scoring the training rows just measures memorization.\n",
        "y_pred = best_model.predict(X_test)\n",
        "accuracy = accuracy_score(y_test, y_pred)\n",
        "print(\"Final Accuracy:\", accuracy)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eOKgOLHmFc8u",
        "outputId": "93efecb0-a9c5-4497-9544-2959fa534f4d"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Random Forest:\n",
            "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n",
            "Best cross-validation accuracy: 0.6977\n",
            "\n",
            "Tuning hyperparameters for Logistic Regression:\n",
            "Best parameters: {'classifier__C': 10}\n",
            "Best cross-validation accuracy: 0.7055\n",
            "\n",
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n",
            "Best cross-validation accuracy: 0.7144999999999999\n",
            "\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n",
            "Best parameters: {'classifier__alpha': 0.1}\n",
            "Best cross-validation accuracy: 0.6957\n",
            "\n",
            "Final Accuracy: 0.9954\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
        "import time\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "nltk.download('stopwords')\n",
        "nltk.download('wordnet')\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read the CSV file with pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "# A bare `df` only renders when it is the last expression of a cell,\n",
        "# so preview the first rows explicitly.\n",
        "display(df.head())\n",
        "\n",
        "# Preprocess text column\n",
        "# Built once at definition time: reloading the stop-word list for every token is very slow.\n",
        "_STOP_WORDS = frozenset(stopwords.words('english'))\n",
        "_LEMMATIZER = WordNetLemmatizer()\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize a raw tweet into a space-separated string of lemmatized tokens.\n",
        "\n",
        "    The text is lowercased, @mentions and URLs are dropped, punctuation is\n",
        "    replaced by spaces, digits are removed, English stop words are filtered\n",
        "    out and each remaining token is lemmatized.\n",
        "\n",
        "    Args:\n",
        "        text: Raw tweet text as a str.\n",
        "\n",
        "    Returns:\n",
        "        The cleaned tweet as one string (empty if no token survives).\n",
        "    \"\"\"\n",
        "    text = text.lower()\n",
        "    # Raw strings keep the regex escapes from being parsed as string escapes.\n",
        "    text = re.sub(r'@[^\\s]+', ' ', text)\n",
        "    text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    # Escape the punctuation so every character is literal inside the class.\n",
        "    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)\n",
        "    text = re.sub(r'[0-9]+', '', text)\n",
        "    # lemmatize() expects a single word, so it is applied token by token.\n",
        "    tokens = [_LEMMATIZER.lemmatize(w) for w in text.split() if w not in _STOP_WORDS]\n",
        "    return \" \".join(tokens)\n",
        "\n",
        "# Clean every tweet before it reaches the vectorizer.\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Features are the cleaned tweets; the target is the label column.\n",
        "X, y = df['tweet'], df['label']\n",
        "\n",
        "# Search spaces per model; the 'classifier__' prefix routes each value\n",
        "# to the pipeline step named 'classifier'.\n",
        "param_grid_rf = {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]}\n",
        "param_grid_lr = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}\n",
        "param_grid_svm = {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']}\n",
        "param_grid_nb = {'classifier__alpha': [0.1, 0.5, 1.0]}\n",
        "\n",
        "# Shuffled 5-fold splitter with a fixed seed, shared by every grid search.\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "def tune_model(name, classifier, param_grid, X_fit, y_fit, cv, report=True):\n",
        "    \"\"\"Grid-search a TF-IDF + classifier pipeline with accuracy scoring.\n",
        "\n",
        "    Args:\n",
        "        name: Display name used in the progress messages.\n",
        "        classifier: Unfitted estimator used as the 'classifier' pipeline step.\n",
        "        param_grid: Grid of values to try, keyed by 'classifier__<param>'.\n",
        "        X_fit: Texts used for cross-validation and the final refit.\n",
        "        y_fit: Labels aligned with X_fit.\n",
        "        cv: Cross-validation splitter.\n",
        "        report: When True, print the best parameters and score.\n",
        "\n",
        "    Returns:\n",
        "        The fitted GridSearchCV instance.\n",
        "    \"\"\"\n",
        "    print(f\"Tuning hyperparameters for {name}:\")\n",
        "    pipeline = Pipeline([\n",
        "        ('tfidf', TfidfVectorizer()),\n",
        "        ('classifier', classifier)\n",
        "    ])\n",
        "    search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)\n",
        "    search.fit(X_fit, y_fit)\n",
        "    if report:\n",
        "        print(f\"Best parameters: {search.best_params_}\")\n",
        "        print(f\"Best cross-validation accuracy: {search.best_score_}\\n\")\n",
        "    return search\n",
        "\n",
        "# Hold out a stratified test split so the final score is measured on tweets\n",
        "# that neither the grid search nor the refit has seen.\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X, y, test_size=0.2, stratify=y, random_state=42\n",
        ")\n",
        "\n",
        "# random_state makes the forest repeatable between runs; max_iter=1000 lets\n",
        "# lbfgs converge instead of stopping at the default iteration limit.\n",
        "grid_search_rf = tune_model('Random Forest', RandomForestClassifier(random_state=42), param_grid_rf, X_train, y_train, kf)\n",
        "grid_search_lr = tune_model('Logistic Regression', LogisticRegression(max_iter=1000), param_grid_lr, X_train, y_train, kf)\n",
        "grid_search_svm = tune_model('Support Vector Machine', SVC(), param_grid_svm, X_train, y_train, kf)\n",
        "grid_search_nb = tune_model('Multinomial Naive Bayes', MultinomialNB(), param_grid_nb, X_train, y_train, kf)\n",
        "\n",
        "# Keep the per-model names that this cell has always exposed.\n",
        "pipeline_rf, best_params_rf, best_score_rf = grid_search_rf.estimator, grid_search_rf.best_params_, grid_search_rf.best_score_\n",
        "pipeline_lr, best_params_lr, best_score_lr = grid_search_lr.estimator, grid_search_lr.best_params_, grid_search_lr.best_score_\n",
        "pipeline_svm, best_params_svm, best_score_svm = grid_search_svm.estimator, grid_search_svm.best_params_, grid_search_svm.best_score_\n",
        "pipeline_nb, best_params_nb, best_score_nb = grid_search_nb.estimator, grid_search_nb.best_params_, grid_search_nb.best_score_\n",
        "\n",
        "# Choose the best model by cross-validation accuracy (the first entry wins ties).\n",
        "searches = {\n",
        "    'Random Forest': grid_search_rf,\n",
        "    'Logistic Regression': grid_search_lr,\n",
        "    'Support Vector Machine': grid_search_svm,\n",
        "    'Multinomial Naive Bayes': grid_search_nb\n",
        "}\n",
        "best_model_name = max(searches, key=lambda model_name: searches[model_name].best_score_)\n",
        "# GridSearchCV refits the winning pipeline on all of X_train by default,\n",
        "# so best_estimator_ is ready to use without another fit.\n",
        "best_model = searches[best_model_name].best_estimator_\n",
        "\n",
        "# Score on the held-out split only; scoring the training rows just measures memorization.\n",
        "y_pred = best_model.predict(X_test)\n",
        "accuracy = accuracy_score(y_test, y_pred)\n",
        "print(\"Final Accuracy:\", accuracy)\n",
        "\n",
        "# Confusion matrix and per-class report for the same held-out predictions.\n",
        "cm = confusion_matrix(y_test, y_pred)\n",
        "cr = classification_report(y_test, y_pred)\n",
        "print(\"Confusion Matrix:\")\n",
        "print(cm)\n",
        "print(\"Classification Report:\")\n",
        "print(cr)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xOXg8UEbC-kj",
        "outputId": "409ac90e-a68d-451b-b299-b3322f54803f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
            "Tuning hyperparameters for Random Forest:\n",
            "Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}\n",
            "Best cross-validation accuracy: 0.6995999999999999\n",
            "\n",
            "Tuning hyperparameters for Logistic Regression:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  n_iter_i = _check_optimize_result(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Best parameters: {'classifier__C': 10}\n",
            "Best cross-validation accuracy: 0.7055\n",
            "\n",
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Best parameters: {'classifier__C': 10, 'classifier__gamma': 'scale'}\n",
            "Best cross-validation accuracy: 0.7144999999999999\n",
            "\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n",
            "Best parameters: {'classifier__alpha': 0.1}\n",
            "Best cross-validation accuracy: 0.6957\n",
            "\n",
            "Final Accuracy: 0.9954\n",
            "Confusion Matrix:\n",
            "[[2423    8    1]\n",
            " [   4 4966   12]\n",
            " [   0   21 2565]]\n",
            "Classification Report:\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       1.00      1.00      1.00      2432\n",
            "           0       0.99      1.00      1.00      4982\n",
            "           1       0.99      0.99      0.99      2586\n",
            "\n",
            "    accuracy                           1.00     10000\n",
            "   macro avg       1.00      0.99      1.00     10000\n",
            "weighted avg       1.00      1.00      1.00     10000\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Tentatively the correct version of the code\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
        "import time\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "nltk.download('stopwords')\n",
        "nltk.download('wordnet')\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read the CSV file with pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "# A bare `df` only renders when it is the last expression of a cell,\n",
        "# so preview the first rows explicitly.\n",
        "display(df.head())\n",
        "\n",
        "# Preprocess text column\n",
        "# Built once at definition time: reloading the stop-word list for every token is very slow.\n",
        "_STOP_WORDS = frozenset(stopwords.words('english'))\n",
        "_LEMMATIZER = WordNetLemmatizer()\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize a raw tweet into a space-separated string of lemmatized tokens.\n",
        "\n",
        "    The text is lowercased, @mentions and URLs are dropped, punctuation is\n",
        "    replaced by spaces, digits are removed, English stop words are filtered\n",
        "    out and each remaining token is lemmatized.\n",
        "\n",
        "    Args:\n",
        "        text: Raw tweet text as a str.\n",
        "\n",
        "    Returns:\n",
        "        The cleaned tweet as one string (empty if no token survives).\n",
        "    \"\"\"\n",
        "    text = text.lower()\n",
        "    # Raw strings keep the regex escapes from being parsed as string escapes.\n",
        "    text = re.sub(r'@[^\\s]+', ' ', text)\n",
        "    text = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    # Escape the punctuation so every character is literal inside the class.\n",
        "    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)\n",
        "    text = re.sub(r'[0-9]+', '', text)\n",
        "    # lemmatize() expects a single word, so it is applied token by token.\n",
        "    tokens = [_LEMMATIZER.lemmatize(w) for w in text.split() if w not in _STOP_WORDS]\n",
        "    return \" \".join(tokens)\n",
        "\n",
        "# Clean every tweet before it reaches the vectorizer.\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Features are the cleaned tweets; the target is the label column.\n",
        "X, y = df['tweet'], df['label']\n",
        "\n",
        "# Search spaces per model; the 'classifier__' prefix routes each value\n",
        "# to the pipeline step named 'classifier'.\n",
        "param_grid_rf = {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]}\n",
        "param_grid_lr = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]}\n",
        "param_grid_svm = {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']}\n",
        "param_grid_nb = {'classifier__alpha': [0.1, 0.5, 1.0]}\n",
        "\n",
        "# Shuffled 5-fold splitter with a fixed seed, shared by every grid search.\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "# Function to display results in a DataFrame\n",
        "def display_results(results):\n",
        "    \"\"\"Show the collected tuning results as a table, one row per entry.\"\"\"\n",
        "    display(pd.DataFrame(results))\n",
        "\n",
        "def tune_model(name, classifier, param_grid, X_fit, y_fit, cv, report=True):\n",
        "    \"\"\"Grid-search a TF-IDF + classifier pipeline with accuracy scoring.\n",
        "\n",
        "    Args:\n",
        "        name: Display name used in the progress messages.\n",
        "        classifier: Unfitted estimator used as the 'classifier' pipeline step.\n",
        "        param_grid: Grid of values to try, keyed by 'classifier__<param>'.\n",
        "        X_fit: Texts used for cross-validation and the final refit.\n",
        "        y_fit: Labels aligned with X_fit.\n",
        "        cv: Cross-validation splitter.\n",
        "        report: When True, print the best parameters and score.\n",
        "\n",
        "    Returns:\n",
        "        The fitted GridSearchCV instance.\n",
        "    \"\"\"\n",
        "    print(f\"Tuning hyperparameters for {name}:\")\n",
        "    pipeline = Pipeline([\n",
        "        ('tfidf', TfidfVectorizer()),\n",
        "        ('classifier', classifier)\n",
        "    ])\n",
        "    search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)\n",
        "    search.fit(X_fit, y_fit)\n",
        "    if report:\n",
        "        print(f\"Best parameters: {search.best_params_}\")\n",
        "        print(f\"Best cross-validation accuracy: {search.best_score_}\\n\")\n",
        "    return search\n",
        "\n",
        "# random_state makes the forest repeatable between runs; max_iter=1000 lets\n",
        "# lbfgs converge instead of stopping at the default iteration limit.\n",
        "# report=False: this cell summarizes the scores in a table instead of printing them.\n",
        "grid_search_rf = tune_model('Random Forest', RandomForestClassifier(random_state=42), param_grid_rf, X, y, kf, report=False)\n",
        "grid_search_lr = tune_model('Logistic Regression', LogisticRegression(max_iter=1000), param_grid_lr, X, y, kf, report=False)\n",
        "grid_search_svm = tune_model('Support Vector Machine', SVC(), param_grid_svm, X, y, kf, report=False)\n",
        "grid_search_nb = tune_model('Multinomial Naive Bayes', MultinomialNB(), param_grid_nb, X, y, kf, report=False)\n",
        "\n",
        "# Keep the per-model names that this cell has always exposed.\n",
        "pipeline_rf, best_params_rf, best_score_rf = grid_search_rf.estimator, grid_search_rf.best_params_, grid_search_rf.best_score_\n",
        "pipeline_lr, best_params_lr, best_score_lr = grid_search_lr.estimator, grid_search_lr.best_params_, grid_search_lr.best_score_\n",
        "pipeline_svm, best_params_svm, best_score_svm = grid_search_svm.estimator, grid_search_svm.best_params_, grid_search_svm.best_score_\n",
        "pipeline_nb, best_params_nb, best_score_nb = grid_search_nb.estimator, grid_search_nb.best_params_, grid_search_nb.best_score_\n",
        "\n",
        "# One summary row per model, in the order the models were tuned.\n",
        "results = [\n",
        "    {'Model': model_name, 'Best Parameters': search.best_params_, 'Best Cross-validation Accuracy': search.best_score_}\n",
        "    for model_name, search in [\n",
        "        ('Random Forest', grid_search_rf),\n",
        "        ('Logistic Regression', grid_search_lr),\n",
        "        ('Support Vector Machine', grid_search_svm),\n",
        "        ('Multinomial Naive Bayes', grid_search_nb)\n",
        "    ]\n",
        "]\n",
        "\n",
        "# Display results\n",
        "display_results(results)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 482
        },
        "id": "rWMIvJ3JJUpT",
        "outputId": "0ab4308e-3d57-47af-aaca-534e68fe43a6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
            "[nltk_data]   Package wordnet is already up-to-date!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
            "Tuning hyperparameters for Random Forest:\n",
            "Tuning hyperparameters for Logistic Regression:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  n_iter_i = _check_optimize_result(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "                     Model                                    Best Parameters  \\\n",
              "0            Random Forest  {'classifier__max_depth': None, 'classifier__n...   \n",
              "1      Logistic Regression                              {'classifier__C': 10}   \n",
              "2   Support Vector Machine  {'classifier__C': 10, 'classifier__gamma': 'sc...   \n",
              "3  Multinomial Naive Bayes                         {'classifier__alpha': 0.1}   \n",
              "\n",
              "   Best Cross-validation Accuracy  \n",
              "0                          0.6981  \n",
              "1                          0.7055  \n",
              "2                          0.7145  \n",
              "3                          0.6957  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-5983e431-3c02-498d-8557-32e687733d6e\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Model</th>\n",
              "      <th>Best Parameters</th>\n",
              "      <th>Best Cross-validation Accuracy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Random Forest</td>\n",
              "      <td>{'classifier__max_depth': None, 'classifier__n...</td>\n",
              "      <td>0.6981</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Logistic Regression</td>\n",
              "      <td>{'classifier__C': 10}</td>\n",
              "      <td>0.7055</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Support Vector Machine</td>\n",
              "      <td>{'classifier__C': 10, 'classifier__gamma': 'sc...</td>\n",
              "      <td>0.7145</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Multinomial Naive Bayes</td>\n",
              "      <td>{'classifier__alpha': 0.1}</td>\n",
              "      <td>0.6957</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5983e431-3c02-498d-8557-32e687733d6e')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-5983e431-3c02-498d-8557-32e687733d6e button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-5983e431-3c02-498d-8557-32e687733d6e');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-83e34f34-9a16-4196-b4e6-cb678a877307\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-83e34f34-9a16-4196-b4e6-cb678a877307')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-83e34f34-9a16-4196-b4e6-cb678a877307 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"display_results(results)\",\n  \"rows\": 4,\n  \"fields\": [\n    {\n      \"column\": \"Model\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          \"Logistic Regression\",\n          \"Multinomial Naive Bayes\",\n          \"Random Forest\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Best Parameters\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Best Cross-validation Accuracy\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.008465419855703142,\n        \"min\": 0.6957,\n        \"max\": 0.7144999999999999,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          0.7055,\n          0.6957,\n          0.6980999999999999\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#using model from hugging face\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
        "import time\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "nltk.download('stopwords')\n",
        "nltk.download('wordnet')\n",
        "\n",
        "from transformers import pipeline\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read the training tweets CSV from the mounted Google Drive into a DataFrame\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "print(df.head())  # Preview the first rows (columns used below: tweet, label)\n",
        "\n",
        "# Preprocess the text column\n",
        "# Build the shared helpers once: stopwords.words() returns a fresh list on\n",
        "# every call, so looking it up per token is needlessly slow.\n",
        "_STOPWORDS = frozenset(stopwords.words('english'))\n",
        "_LEMMATIZER = WordNetLemmatizer()\n",
        "_MENTION_RE = re.compile(r'@[^\\s]+')\n",
        "_URL_RE = re.compile(r'((www\\.[^\\s]+)|(https?://[^\\s]+))')\n",
        "# re.escape keeps characters such as ], ^, - and the backslash literal inside\n",
        "# the character class; unescaped, the backslash is never matched.\n",
        "_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')\n",
        "_DIGITS_RE = re.compile(r'[0-9]+')\n",
        "\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize one tweet into a cleaned, lemmatized, lower-case string.\n",
        "\n",
        "    Steps, in order: lower-case the text, blank out @mentions and\n",
        "    www/http(s) URLs, replace punctuation with spaces, delete digits,\n",
        "    drop English stopwords and lemmatize every remaining word.\n",
        "\n",
        "    Args:\n",
        "        text: Raw tweet text (a str).\n",
        "\n",
        "    Returns:\n",
        "        The remaining words joined by single spaces; an empty string if\n",
        "        nothing is left after cleaning.\n",
        "    \"\"\"\n",
        "    text = text.lower()\n",
        "    text = _MENTION_RE.sub(' ', text)\n",
        "    text = _URL_RE.sub(' ', text)\n",
        "    text = _PUNCT_RE.sub(' ', text)\n",
        "    text = _DIGITS_RE.sub('', text)\n",
        "    # Lemmatize word by word: passing the whole sentence to lemmatize()\n",
        "    # treats it as a single word and leaves it unchanged.\n",
        "    words = [_LEMMATIZER.lemmatize(word) for word in text.split() if word not in _STOPWORDS]\n",
        "    return ' '.join(words)\n",
        "\n",
        "# Overwrite the raw tweets with their cleaned form (mutates df in place)\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Initialize sentiment analysis pipeline from a pretrained Hugging Face checkpoint\n",
        "sentiment_pipeline = pipeline(\"text-classification\", model=\"j-hartmann/sentiment-roberta-large-english-3-classes\")\n",
        "\n",
        "# Apply sentiment analysis to the already-preprocessed tweets (one prediction per tweet)\n",
        "sentiment_predictions = sentiment_pipeline(df['tweet'].tolist())\n",
        "\n",
        "# Add the predicted sentiment label of each tweet to the DataFrame\n",
        "# NOTE(review): this column is not read by the model tuning below, which uses only tweet/label - confirm intent\n",
        "df['sentiment'] = [sent['label'] for sent in sentiment_predictions]\n",
        "\n",
        "# Features are the cleaned tweet strings; the target is the label column\n",
        "X = df['tweet']\n",
        "y = df['label']\n",
        "\n",
        "# Search spaces, one per model. Every key carries the 'classifier__' prefix so\n",
        "# GridSearchCV routes it to the 'classifier' step of the pipeline.\n",
        "\n",
        "# Random Forest: number of trees and maximum tree depth\n",
        "param_grid_rf = dict(\n",
        "    classifier__n_estimators=[50, 100, 200],\n",
        "    classifier__max_depth=[None, 10, 20],\n",
        ")\n",
        "\n",
        "# Logistic Regression: inverse regularization strength\n",
        "param_grid_lr = dict(\n",
        "    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],\n",
        ")\n",
        "\n",
        "# Support Vector Machine: regularization strength and kernel coefficient\n",
        "param_grid_svm = dict(\n",
        "    classifier__C=[0.1, 1, 10],\n",
        "    classifier__gamma=['scale', 'auto'],\n",
        ")\n",
        "\n",
        "# Multinomial Naive Bayes: additive smoothing parameter\n",
        "param_grid_nb = dict(\n",
        "    classifier__alpha=[0.1, 0.5, 1.0],\n",
        ")\n",
        "\n",
        "# Shared 5-fold splitter: shuffled with a fixed seed so all models see the same folds\n",
        "kf = KFold(n_splits=5, random_state=42, shuffle=True)\n",
        "\n",
        "# Function to display results in a DataFrame\n",
        "def display_results(results):\n",
        "    \"\"\"Render the collected tuning results as a DataFrame table.\n",
        "\n",
        "    Args:\n",
        "        results: Records accepted by pd.DataFrame (here a list of dicts,\n",
        "            one per tuned model).\n",
        "    \"\"\"\n",
        "    display(pd.DataFrame(results))\n",
        "\n",
        "# Initialize an empty list to store results\n",
        "results = []\n",
        "\n",
        "\n",
        "def tune_model(model_name, classifier, param_grid, X, y, cv, results):\n",
        "    \"\"\"Grid-search a TF-IDF + classifier pipeline and record its best result.\n",
        "\n",
        "    Args:\n",
        "        model_name: Label used in the progress message and the results row.\n",
        "        classifier: Unfitted estimator placed in the 'classifier' step.\n",
        "        param_grid: Grid whose keys use the 'classifier__' prefix.\n",
        "        X: Text samples fed to the TF-IDF step.\n",
        "        y: Target labels.\n",
        "        cv: Cross-validation splitter handed to GridSearchCV.\n",
        "        results: List that receives one summary dict for this model.\n",
        "\n",
        "    Returns:\n",
        "        Tuple of (pipeline, fitted GridSearchCV).\n",
        "    \"\"\"\n",
        "    print(f'Tuning hyperparameters for {model_name}:')\n",
        "    model_pipeline = Pipeline([\n",
        "        ('tfidf', TfidfVectorizer()),\n",
        "        ('classifier', classifier)\n",
        "    ])\n",
        "    grid_search = GridSearchCV(model_pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)\n",
        "    grid_search.fit(X, y)\n",
        "    results.append({\n",
        "        'Model': model_name,\n",
        "        'Best Parameters': grid_search.best_params_,\n",
        "        'Best Cross-validation Accuracy': grid_search.best_score_\n",
        "    })\n",
        "    return model_pipeline, grid_search\n",
        "\n",
        "\n",
        "# Random Forest (seeded so the cross-validation score is reproducible)\n",
        "pipeline_rf, grid_search_rf = tune_model(\n",
        "    'Random Forest', RandomForestClassifier(random_state=42), param_grid_rf, X, y, kf, results)\n",
        "best_params_rf = grid_search_rf.best_params_\n",
        "best_score_rf = grid_search_rf.best_score_\n",
        "\n",
        "# Logistic Regression (max_iter raised: with the default limit lbfgs stops\n",
        "# early and emits a ConvergenceWarning on this data)\n",
        "pipeline_lr, grid_search_lr = tune_model(\n",
        "    'Logistic Regression', LogisticRegression(max_iter=1000), param_grid_lr, X, y, kf, results)\n",
        "best_params_lr = grid_search_lr.best_params_\n",
        "best_score_lr = grid_search_lr.best_score_\n",
        "\n",
        "# Support Vector Machine\n",
        "pipeline_svm, grid_search_svm = tune_model(\n",
        "    'Support Vector Machine', SVC(), param_grid_svm, X, y, kf, results)\n",
        "best_params_svm = grid_search_svm.best_params_\n",
        "best_score_svm = grid_search_svm.best_score_\n",
        "\n",
        "# Multinomial Naive Bayes\n",
        "pipeline_nb, grid_search_nb = tune_model(\n",
        "    'Multinomial Naive Bayes', MultinomialNB(), param_grid_nb, X, y, kf, results)\n",
        "best_params_nb = grid_search_nb.best_params_\n",
        "best_score_nb = grid_search_nb.best_score_\n",
        "\n",
        "# Display results\n",
        "display_results(results)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 984,
          "referenced_widgets": [
            "f7b7943b77ec48709ab2c14ff9c7d8d0",
            "8ccb497ccc254b4fbb83a6075d999c98",
            "462e849b058049eb90bdec861c4c6733",
            "433354ec1982417b8e182d982451cff8",
            "79e9f2981ea4421a864c14f0c3c47c88",
            "1f7ddf0908aa4a4bbf20c0a98c534802",
            "a62aa5c3bacd439b88b66448048054de",
            "29f23229640c4a3b978c630bed86b0a0",
            "65e51c3b78c14a1db0b8b730534aa5b9",
            "108e829acb474ad7b9b5d4576fbfd340",
            "7f94536dab254a8ebbce57f65a99511d",
            "e2dc2d66fe9d4c068b7400aa68166d82",
            "c4fdbaab18b04bd092bc8327dc9384f9",
            "76d175d827dc479da41cf4d195176a73",
            "02d1a242c3c346f9ba00d4ef54632080",
            "3afe49394a7b4e63abbfa6eea620054b",
            "5cc5407d5562422b8b4306b9b4540ab8",
            "2bbd4694331d42b69a279571576deb51",
            "23c88c6f258a402288b6ffe041f61f0b",
            "8c75ac06d39f4bf58c4b2cdf513c47a1",
            "ac9cf05fb66c4b28beebe6ce874a296b",
            "5180b588b5f546a1a5c1b99842ad9bc4",
            "536c41525a424101a64e8c3c7fe6a1a7",
            "c95bb7e9112f49bbb4b266fe7462586b",
            "3d53fbf327a74eccbdeb2bdd7e0e9ea6",
            "f15764b0fbcc470da3735693598b03d0",
            "6b9d84472ca14e869b0eedfd5faa853e",
            "e7c76fe4370b4674a6ada68197e5b6e1",
            "521b25ffe3e6414d918d9c38dc0f6251",
            "49e97c7601b1484cb32de9a14b66a80e",
            "a60aa23254dc402397580e8cfa14dde2",
            "97016178d3b249398e3bbbd6902a1f45",
            "e4203f38f1e34e9baa953248c5d04867",
            "55fee200080e4ffa842c7b3ccba6202c",
            "681e215549a64a2d86046c20f943accb",
            "585daf7d5699401284c3fe9991cfbd1d",
            "0754f254add24b1bbac53908ace0674e",
            "1c5dd19c1b994e20b6286f921d6e3a08",
            "c05c707a897242cebb1844ba012ba823",
            "3d05b61f2061429095a1a2d4dc7500dd",
            "a8e4bf0b21184ccf9951bdb9114fd10e",
            "96934e1a081e480e9608f355cca41566",
            "68f47edfbd5d4bacaa3e6424c98f8961",
            "ffa81a80e84546148817330dd5c9eefa",
            "c9d0683b6fb64df1bc370de4011fe77f",
            "d1dc02d934cc417b983dc13d80b22d31",
            "cad51788127f440393bea3c2d165f42e",
            "2eaee4ea34a647259f015fbd9190e821",
            "710c585a886d4d17bd66ebef545b3371",
            "f2130c9ea479452699ce1389a74097e6",
            "f7db554186144eb0807f71fc142580fa",
            "0c61a26cae2d4e2f851be13f51ee78c3",
            "ec49ebfb9b16430181b4738b01844692",
            "b6f81ce13f774cc1a42ec659d50bfca3",
            "ffc2e024df9241de9d7c22269a47fe96",
            "c7581989b1d84bc5b453551440671767",
            "f841b17df29b42799c740b305f44c3f5",
            "fb3820553c8246f8a3f19d11f2929e26",
            "9a0be6bf09a943dba3582e78661b64ce",
            "97881b93a47e470eb9673ef97a28d741",
            "696a8fb7186e4ad9bed904d79cc5da17",
            "fb483bee70b04cd1bc2cef99941dfbe8",
            "dc610f57bb0d42a4970fd56e2beafd3e",
            "77f9434bf9984bbf9d7e0b409f58c236",
            "1b7e9249af884ecd89bdfe4da708ac38",
            "c3e369bcece54aa4bb6baba02f6c4c30"
          ]
        },
        "id": "gTDYJRFtQ0Xm",
        "outputId": "637b27dc-3121-4534-84a1-1ee31e97a5ab"
      },
      "execution_count": null,
      "outputs": [
        {
          "metadata": {
            "tags": null
          },
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
          ]
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Mounted at /content/drive\n",
            "                                               tweet  label\n",
            "0  BofA previews Netflixs NFLX Q3 Earnings Tues 0...      0\n",
            "1  I scooped a couple of shares this morning at a...      0\n",
            "2  Im streaming ES Futures using Bookmap on youtu...      0\n",
            "3             CF taking some off here close to 19150      1\n",
            "4  No change to this position is still bullish st...      0\n"
          ]
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
            "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
            "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
            "You will be able to reuse this secret in all of your notebooks.\n",
            "Please note that authentication is recommended but still optional to access public models or datasets.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f7b7943b77ec48709ab2c14ff9c7d8d0",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "e2dc2d66fe9d4c068b7400aa68166d82",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
            "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "536c41525a424101a64e8c3c7fe6a1a7",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "tokenizer_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "55fee200080e4ffa842c7b3ccba6202c",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "c9d0683b6fb64df1bc370de4011fe77f",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "c7581989b1d84bc5b453551440671767",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Random Forest:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
            "  pid = os.fork()\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Logistic Regression:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  n_iter_i = _check_optimize_result(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "                     Model                                    Best Parameters  \\\n",
              "0            Random Forest  {'classifier__max_depth': None, 'classifier__n...   \n",
              "1      Logistic Regression                              {'classifier__C': 10}   \n",
              "2   Support Vector Machine  {'classifier__C': 10, 'classifier__gamma': 'sc...   \n",
              "3  Multinomial Naive Bayes                         {'classifier__alpha': 0.1}   \n",
              "\n",
              "   Best Cross-validation Accuracy  \n",
              "0                          0.6981  \n",
              "1                          0.7055  \n",
              "2                          0.7145  \n",
              "3                          0.6957  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-7946890c-009f-458c-92f6-02461444ffb3\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Model</th>\n",
              "      <th>Best Parameters</th>\n",
              "      <th>Best Cross-validation Accuracy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Random Forest</td>\n",
              "      <td>{'classifier__max_depth': None, 'classifier__n...</td>\n",
              "      <td>0.6981</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Logistic Regression</td>\n",
              "      <td>{'classifier__C': 10}</td>\n",
              "      <td>0.7055</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Support Vector Machine</td>\n",
              "      <td>{'classifier__C': 10, 'classifier__gamma': 'sc...</td>\n",
              "      <td>0.7145</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Multinomial Naive Bayes</td>\n",
              "      <td>{'classifier__alpha': 0.1}</td>\n",
              "      <td>0.6957</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7946890c-009f-458c-92f6-02461444ffb3')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-7946890c-009f-458c-92f6-02461444ffb3 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-7946890c-009f-458c-92f6-02461444ffb3');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-a7d8f06d-73af-471e-af96-e732efbc7082\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-a7d8f06d-73af-471e-af96-e732efbc7082')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-a7d8f06d-73af-471e-af96-e732efbc7082 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"display_results(results)\",\n  \"rows\": 4,\n  \"fields\": [\n    {\n      \"column\": \"Model\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          \"Logistic Regression\",\n          \"Multinomial Naive Bayes\",\n          \"Random Forest\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Best Parameters\",\n      \"properties\": {\n        \"dtype\": \"object\",\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Best Cross-validation Accuracy\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.008465419855703142,\n        \"min\": 0.6957,\n        \"max\": 0.7144999999999999,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          0.7055,\n          0.6957,\n          0.6980999999999999\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#تعديل على الكود #using model from hugging face\n",
        "\n",
        "# Libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
        "import time\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "nltk.download('stopwords')\n",
        "nltk.download('wordnet')\n",
        "\n",
        "from transformers import pipeline\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read CSV file\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "print(df.head())  # Display the data\n",
        "\n",
        "# Preprocess text function\n",
        "def preprocess_text(text):\n",
        "    text = text.lower()\n",
        "    text = re.sub('@[^\\s]+', ' ', text)\n",
        "    text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    text = re.sub(f'[{string.punctuation}]', ' ', text)\n",
        "    text = re.sub('[0-9]+', '', text)\n",
        "    text = \" \".join(str(text).split())\n",
        "    text = [w for w in text.split() if w not in stopwords.words('english')]\n",
        "    text = \" \".join(text)\n",
        "    text = WordNetLemmatizer().lemmatize(text)\n",
        "    return text\n",
        "\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Split data into features and target (avoiding data leakage)\n",
        "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n",
        "\n",
        "# Initialize KFold cross-validator\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "# Function to display results in a DataFrame\n",
        "def display_results(results):\n",
        "    df_results = pd.DataFrame(results)\n",
        "    print(df_results)\n",
        "\n",
        "# Initialize an empty list to store results\n",
        "results = []\n",
        "\n",
        "# Define hyperparameters grid for Random Forest\n",
        "param_grid_rf = {\n",
        "  'classifier__n_estimators': [50, 100, 200],\n",
        "  'classifier__max_depth': [None, 10, 20]\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Logistic Regression\n",
        "param_grid_lr = {\n",
        "  'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Support Vector Machine\n",
        "param_grid_svm = {\n",
        "  'classifier__C': [0.1, 1, 10],\n",
        "  'classifier__gamma': ['scale', 'auto']\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Multinomial Naive Bayes\n",
        "param_grid_nb = {\n",
        "  'classifier__alpha': [0.1, 0.5, 1.0]\n",
        "}\n",
        "\n",
        "# Random Forest\n",
        "print(\"Tuning hyperparameters for Random Forest:\")\n",
        "pipeline_rf = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer()),\n",
        "    ('classifier', RandomForestClassifier())\n",
        "])\n",
        "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_rf.fit(X_train, y_train)\n",
        "best_params_rf = grid_search_rf.best_params_\n",
        "best_score_rf = grid_search_rf.best_score_\n",
        "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n",
        "\n",
        "# Logistic Regression\n",
        "print(\"Tuning hyperparameters for Logistic Regression:\")\n",
        "pipeline_lr = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer()),\n",
        "    ('classifier', LogisticRegression())\n",
        "])\n",
        "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_lr.fit(X_train, y_train)\n",
        "best_params_lr = grid_search_lr.best_params_\n",
        "best_score_lr = grid_search_lr.best_score_\n",
        "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n",
        "\n",
        "# Support Vector Machine\n",
        "print(\"Tuning hyperparameters for Support Vector Machine:\")\n",
        "pipeline_svm = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer()),\n",
        "    ('classifier', SVC())\n",
        "])\n",
        "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_svm.fit(X_train, y_train)\n",
        "best_params_svm = grid_search_svm.best_params_\n",
        "best_score_svm = grid_search_svm.best_score_\n",
        "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n",
        "\n",
        "# Multinomial Naive Bayes\n",
        "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n",
        "pipeline_nb = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer()),\n",
        "    ('classifier', MultinomialNB())\n",
        "])\n",
        "grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_nb.fit(X_train, y_train)\n",
        "best_params_nb = grid_search_nb.best_params_\n",
        "best_score_nb = grid_search_nb.best_score_\n",
        "results.append({'Model': 'Multinomial Naive Bayes', 'Best Parameters': best_params_nb, 'Best Cross-validation Accuracy': best_score_nb})\n",
        "\n",
        "# Display results\n",
        "display_results(results)\n",
        "\n",
        "# Evaluate on test data (including additional metrics)\n",
        "print(\"Performance on Test Set:\")\n",
        "for model_name, model in [('Random Forest', grid_search_rf), ('Logistic Regression', grid_search_lr), ('Support Vector Machine', grid_search_svm), ('Multinomial Naive Bayes', grid_search_nb)]:\n",
        "  y_pred = model.predict(X_test)\n",
        "  print(f\"Model: {model_name}\")\n",
        "  print(classification_report(y_test, y_pred))\n",
        "  print(confusion_matrix(y_test, y_pred))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "D1PMZ1g5ts_Z",
        "outputId": "5044b9c4-7106-4ffa-f455-1d383e96bea8"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
            "[nltk_data]   Package wordnet is already up-to-date!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
            "                                               tweet  label\n",
            "0  BofA previews Netflixs NFLX Q3 Earnings Tues 0...      0\n",
            "1  I scooped a couple of shares this morning at a...      0\n",
            "2  Im streaming ES Futures using Bookmap on youtu...      0\n",
            "3             CF taking some off here close to 19150      1\n",
            "4  No change to this position is still bullish st...      0\n",
            "Tuning hyperparameters for Random Forest:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
            "  pid = os.fork()\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Logistic Regression:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  n_iter_i = _check_optimize_result(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n",
            "                     Model                                    Best Parameters  \\\n",
            "0            Random Forest  {'classifier__max_depth': None, 'classifier__n...   \n",
            "1      Logistic Regression                              {'classifier__C': 10}   \n",
            "2   Support Vector Machine  {'classifier__C': 10, 'classifier__gamma': 'sc...   \n",
            "3  Multinomial Naive Bayes                         {'classifier__alpha': 0.1}   \n",
            "\n",
            "   Best Cross-validation Accuracy  \n",
            "0                        0.691000  \n",
            "1                        0.698375  \n",
            "2                        0.707000  \n",
            "3                        0.689750  \n",
            "Performance on Test Set:\n",
            "Model: Random Forest\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.70      0.51      0.59       512\n",
            "           0       0.68      0.86      0.76       985\n",
            "           1       0.78      0.58      0.67       503\n",
            "\n",
            "    accuracy                           0.70      2000\n",
            "   macro avg       0.72      0.65      0.67      2000\n",
            "weighted avg       0.71      0.70      0.69      2000\n",
            "\n",
            "[[260 218  34]\n",
            " [ 90 847  48]\n",
            " [ 22 188 293]]\n",
            "Model: Logistic Regression\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.68      0.57      0.62       512\n",
            "           0       0.70      0.79      0.74       985\n",
            "           1       0.69      0.62      0.66       503\n",
            "\n",
            "    accuracy                           0.69      2000\n",
            "   macro avg       0.69      0.66      0.67      2000\n",
            "weighted avg       0.69      0.69      0.69      2000\n",
            "\n",
            "[[294 174  44]\n",
            " [113 777  95]\n",
            " [ 25 164 314]]\n",
            "Model: Support Vector Machine\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.70      0.56      0.62       512\n",
            "           0       0.68      0.84      0.75       985\n",
            "           1       0.77      0.57      0.66       503\n",
            "\n",
            "    accuracy                           0.70      2000\n",
            "   macro avg       0.72      0.66      0.68      2000\n",
            "weighted avg       0.71      0.70      0.69      2000\n",
            "\n",
            "[[285 200  27]\n",
            " [ 99 827  59]\n",
            " [ 22 194 287]]\n",
            "Model: Multinomial Naive Bayes\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.67      0.59      0.63       512\n",
            "           0       0.68      0.80      0.74       985\n",
            "           1       0.71      0.55      0.62       503\n",
            "\n",
            "    accuracy                           0.69      2000\n",
            "   macro avg       0.69      0.65      0.66      2000\n",
            "weighted avg       0.69      0.69      0.68      2000\n",
            "\n",
            "[[300 175  37]\n",
            " [117 792  76]\n",
            " [ 28 196 279]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.metrics import accuracy_score\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# قراءة ملف CSV باستخدام Pandas\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "\n",
        "# Preprocess text column\n",
        "def preprocess_text(text):\n",
        "    text = text.lower()\n",
        "    text = re.sub('@[^\\s]+', ' ', text)\n",
        "    text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', ' ', text)\n",
        "    text = re.sub(f'[{string.punctuation}]', ' ', text)\n",
        "    text = re.sub('[0-9]+', '', text)\n",
        "    text = \" \".join(str(text).split())\n",
        "    text = [w for w in text.split() if w not in stopwords.words('english')]\n",
        "    text = \" \".join(text)\n",
        "    text = WordNetLemmatizer().lemmatize(text)\n",
        "    return text\n",
        "\n",
        "df['tweet'] = df['tweet'].apply(preprocess_text)\n",
        "\n",
        "# Split data into features and target\n",
        "X = df['tweet']\n",
        "y = df['label']\n",
        "\n",
        "# Initialize RandomForestClassifier\n",
        "rf = RandomForestClassifier()\n",
        "\n",
        "# Define hyperparameters grid\n",
        "param_grid = {\n",
        "    'rf__n_estimators': [50, 100, 200],\n",
        "    'rf__max_depth': [None, 10, 20],\n",
        "}\n",
        "\n",
        "# Initialize KFold cross-validator\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "# Define pipeline\n",
        "pipeline = Pipeline([\n",
        "    ('vectorizer', TfidfVectorizer()),\n",
        "    ('rf', rf)\n",
        "])\n",
        "\n",
        "# Perform hyperparameter tuning\n",
        "grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search.fit(X, y)\n",
        "\n",
        "# Get best model and parameters\n",
        "best_model = grid_search.best_estimator_\n",
        "best_params = grid_search.best_params_\n",
        "best_score = grid_search.best_score_\n",
        "\n",
        "print(\"Best parameters:\", best_params)\n",
        "print(\"Best cross-validation accuracy:\", best_score)\n",
        "\n",
        "# Fit the best model\n",
        "best_model.fit(X, y)\n",
        "\n",
        "# Predict using the best model\n",
        "y_pred = best_model.predict(X)\n",
        "\n",
        "# Calculate accuracy\n",
        "accuracy = accuracy_score(y, y_pred)\n",
        "print(\"Final Accuracy:\", accuracy)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "UhM3zb7S-XI6",
        "outputId": "1934f8c1-a924-40d9-e840-f3c714e54e83"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
            "Best parameters: {'rf__max_depth': None, 'rf__n_estimators': 200}\n",
            "Best cross-validation accuracy: 0.6987\n",
            "Final Accuracy: 0.9955\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#بدون معالجة\n",
        "# Libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import re\n",
        "import string\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.model_selection import train_test_split, KFold, GridSearchCV\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
        "import time\n",
        "from imblearn.over_sampling import RandomOverSampler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "nltk.download('stopwords')\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# Read CSV file\n",
        "df = pd.read_csv('/content/drive/MyDrive/train (1).csv')\n",
        "print(df.head())  # Display the data\n",
        "\n",
        "# Split data into features and target (avoiding data leakage)\n",
        "X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)\n",
        "\n",
        "# Initialize KFold cross-validator\n",
        "kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "# Function to display results in a DataFrame\n",
        "def display_results(results):\n",
        "    df_results = pd.DataFrame(results)\n",
        "    print(df_results)\n",
        "\n",
        "# Initialize an empty list to store results\n",
        "results = []\n",
        "\n",
        "# Define hyperparameters grid for Random Forest\n",
        "param_grid_rf = {\n",
        "  'classifier__n_estimators': [50, 100, 200],\n",
        "  'classifier__max_depth': [None, 10, 20]\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Logistic Regression\n",
        "param_grid_lr = {\n",
        "  'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Support Vector Machine\n",
        "param_grid_svm = {\n",
        "  'classifier__C': [0.1, 1, 10],\n",
        "  'classifier__gamma': ['scale', 'auto']\n",
        "}\n",
        "\n",
        "# Define hyperparameters grid for Multinomial Naive Bayes\n",
        "param_grid_nb = {\n",
        "  'classifier__alpha': [0.1, 0.5, 1.0]\n",
        "}\n",
        "\n",
        "# Random Forest\n",
        "print(\"Tuning hyperparameters for Random Forest:\")\n",
        "pipeline_rf = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n",
        "    ('classifier', RandomForestClassifier())\n",
        "])\n",
        "grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_rf.fit(X_train, y_train)\n",
        "best_params_rf = grid_search_rf.best_params_\n",
        "best_score_rf = grid_search_rf.best_score_\n",
        "results.append({'Model': 'Random Forest', 'Best Parameters': best_params_rf, 'Best Cross-validation Accuracy': best_score_rf})\n",
        "\n",
        "# Logistic Regression\n",
        "print(\"Tuning hyperparameters for Logistic Regression:\")\n",
        "pipeline_lr = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n",
        "    ('classifier', LogisticRegression())\n",
        "])\n",
        "grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_lr.fit(X_train, y_train)\n",
        "best_params_lr = grid_search_lr.best_params_\n",
        "best_score_lr = grid_search_lr.best_score_\n",
        "results.append({'Model': 'Logistic Regression', 'Best Parameters': best_params_lr, 'Best Cross-validation Accuracy': best_score_lr})\n",
        "\n",
        "# Support Vector Machine\n",
        "print(\"Tuning hyperparameters for Support Vector Machine:\")\n",
        "pipeline_svm = Pipeline([\n",
        "    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n",
        "    ('classifier', SVC())\n",
        "])\n",
        "grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=kf, scoring='accuracy', n_jobs=-1)\n",
        "grid_search_svm.fit(X_train, y_train)\n",
        "best_params_svm = grid_search_svm.best_params_\n",
        "best_score_svm = grid_search_svm.best_score_\n",
        "results.append({'Model': 'Support Vector Machine', 'Best Parameters': best_params_svm, 'Best Cross-validation Accuracy': best_score_svm})\n",
        "\n",
        "# --- Multinomial Naive Bayes ---\n",
        "# TF-IDF -> MultinomialNB pipeline, tuned over param_grid_nb by grid search.\n",
        "print(\"Tuning hyperparameters for Multinomial Naive Bayes:\")\n",
        "pipeline_nb = Pipeline(steps=[\n",
        "    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),\n",
        "    ('classifier', MultinomialNB()),\n",
        "])\n",
        "grid_search_nb = GridSearchCV(\n",
        "    estimator=pipeline_nb,\n",
        "    param_grid=param_grid_nb,\n",
        "    scoring='accuracy',\n",
        "    cv=kf,\n",
        "    n_jobs=-1,\n",
        ")\n",
        "grid_search_nb.fit(X_train, y_train)\n",
        "# Record the best parameter combination and its mean CV accuracy\n",
        "best_params_nb, best_score_nb = grid_search_nb.best_params_, grid_search_nb.best_score_\n",
        "results.append({\n",
        "    'Model': 'Multinomial Naive Bayes',\n",
        "    'Best Parameters': best_params_nb,\n",
        "    'Best Cross-validation Accuracy': best_score_nb,\n",
        "})\n",
        "\n",
        "# Hand the collected tuning results to display_results\n",
        "display_results(results)\n",
        "\n",
        "# Test-data evaluation: per-class precision/recall/F1 plus the confusion matrix\n",
        "# for each fitted grid search, in the same order the models were tuned.\n",
        "print(\"Performance on Test Set:\")\n",
        "fitted_searches = {\n",
        "    'Random Forest': grid_search_rf,\n",
        "    'Logistic Regression': grid_search_lr,\n",
        "    'Support Vector Machine': grid_search_svm,\n",
        "    'Multinomial Naive Bayes': grid_search_nb,\n",
        "}\n",
        "for model_name, model in fitted_searches.items():\n",
        "    y_pred = model.predict(X_test)\n",
        "    print(f\"Model: {model_name}\")\n",
        "    print(classification_report(y_test, y_pred))\n",
        "    print(confusion_matrix(y_test, y_pred))\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ABQvR4RLKbwq",
        "outputId": "3a5fdc25-85d3-43c1-b69d-6ae95fdce6bb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
            "                                               tweet  label\n",
            "0  BofA previews Netflixs NFLX Q3 Earnings Tues 0...      0\n",
            "1  I scooped a couple of shares this morning at a...      0\n",
            "2  Im streaming ES Futures using Bookmap on youtu...      0\n",
            "3             CF taking some off here close to 19150      1\n",
            "4  No change to this position is still bullish st...      0\n",
            "Tuning hyperparameters for Random Forest:\n",
            "Tuning hyperparameters for Logistic Regression:\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
            "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
            "\n",
            "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
            "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
            "Please also refer to the documentation for alternative solver options:\n",
            "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
            "  n_iter_i = _check_optimize_result(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tuning hyperparameters for Support Vector Machine:\n",
            "Tuning hyperparameters for Multinomial Naive Bayes:\n",
            "                     Model                                    Best Parameters  \\\n",
            "0            Random Forest  {'classifier__max_depth': None, 'classifier__n...   \n",
            "1      Logistic Regression                              {'classifier__C': 10}   \n",
            "2   Support Vector Machine  {'classifier__C': 10, 'classifier__gamma': 'sc...   \n",
            "3  Multinomial Naive Bayes                         {'classifier__alpha': 0.1}   \n",
            "\n",
            "   Best Cross-validation Accuracy  \n",
            "0                        0.686875  \n",
            "1                        0.700625  \n",
            "2                        0.705000  \n",
            "3                        0.695625  \n",
            "Performance on Test Set:\n",
            "Model: Random Forest\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.71      0.51      0.59       512\n",
            "           0       0.68      0.87      0.76       985\n",
            "           1       0.76      0.56      0.64       503\n",
            "\n",
            "    accuracy                           0.70      2000\n",
            "   macro avg       0.72      0.65      0.67      2000\n",
            "weighted avg       0.71      0.70      0.69      2000\n",
            "\n",
            "[[259 209  44]\n",
            " [ 81 860  44]\n",
            " [ 23 200 280]]\n",
            "Model: Logistic Regression\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.68      0.59      0.63       512\n",
            "           0       0.70      0.79      0.74       985\n",
            "           1       0.70      0.63      0.66       503\n",
            "\n",
            "    accuracy                           0.70      2000\n",
            "   macro avg       0.69      0.67      0.68      2000\n",
            "weighted avg       0.70      0.70      0.69      2000\n",
            "\n",
            "[[303 170  39]\n",
            " [115 774  96]\n",
            " [ 29 159 315]]\n",
            "Model: Support Vector Machine\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.70      0.55      0.61       512\n",
            "           0       0.68      0.84      0.75       985\n",
            "           1       0.78      0.58      0.66       503\n",
            "\n",
            "    accuracy                           0.70      2000\n",
            "   macro avg       0.72      0.65      0.67      2000\n",
            "weighted avg       0.71      0.70      0.69      2000\n",
            "\n",
            "[[281 206  25]\n",
            " [100 826  59]\n",
            " [ 23 190 290]]\n",
            "Model: Multinomial Naive Bayes\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "          -1       0.66      0.61      0.63       512\n",
            "           0       0.69      0.79      0.73       985\n",
            "           1       0.69      0.55      0.62       503\n",
            "\n",
            "    accuracy                           0.68      2000\n",
            "   macro avg       0.68      0.65      0.66      2000\n",
            "weighted avg       0.68      0.68      0.68      2000\n",
            "\n",
            "[[311 159  42]\n",
            " [128 776  81]\n",
            " [ 32 192 279]]\n"
          ]
        }
      ]
    }
  ]
}