{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"fbbc510b0e1548748409b6c016bdaa24": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_5b95f4951d0b44bb99d2b76f54556511",
"IPY_MODEL_031b30d2c7d7402f8c63dd11c83bc690",
"IPY_MODEL_38b1d5d5328249e2b830e9847cf407e1"
],
"layout": "IPY_MODEL_30e4917df33c4f4fbd7813ab864f6b76"
}
},
"5b95f4951d0b44bb99d2b76f54556511": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b470b1d8fb274a6fb933ecab0c18fa12",
"placeholder": "",
"style": "IPY_MODEL_6eba305524bb4ea08d36eff08f34fa75",
"value": "100%"
}
},
"031b30d2c7d7402f8c63dd11c83bc690": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_57519ea679234aa18d9cb7e44b8181a9",
"max": 3,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_ae126db234f34079a2bf79dbabd624a3",
"value": 3
}
},
"38b1d5d5328249e2b830e9847cf407e1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3fce668226424a70bd0afcc725d68739",
"placeholder": "",
"style": "IPY_MODEL_b810c87d8aec4c96acb20f4563777ddc",
"value": " 3/3 [00:00<00:00, 37.60it/s]"
}
},
"30e4917df33c4f4fbd7813ab864f6b76": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b470b1d8fb274a6fb933ecab0c18fa12": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6eba305524bb4ea08d36eff08f34fa75": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"57519ea679234aa18d9cb7e44b8181a9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ae126db234f34079a2bf79dbabd624a3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"3fce668226424a70bd0afcc725d68739": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b810c87d8aec4c96acb20f4563777ddc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"2691abf9765e4cf18a142cfc10ee5d2f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_2d8281fc5bc6411a898e4c31687462d9",
"IPY_MODEL_831f795de5d54f32bbd821bcafb16675",
"IPY_MODEL_fc06fd3b96124f498f041cff507fe00a"
],
"layout": "IPY_MODEL_5bc4c971d82b47eaad865379fa9c381c"
}
},
"2d8281fc5bc6411a898e4c31687462d9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9469012a76194346b83c438ee64b620e",
"placeholder": "",
"style": "IPY_MODEL_874e1e2f9f5341a4bed55ea0ae4b8cc1",
"value": "100%"
}
},
"831f795de5d54f32bbd821bcafb16675": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_038876ce0b0d456ab5c4f30f8f498dbe",
"max": 68,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_5102475328e84e15b3d178dc599cc93a",
"value": 68
}
},
"fc06fd3b96124f498f041cff507fe00a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a2a210635b0a479d837de6312410032c",
"placeholder": "",
"style": "IPY_MODEL_5162637d8a9d4fc0824c3cd6dd19cf83",
"value": " 68/68 [00:04<00:00, 14.34ba/s]"
}
},
"5bc4c971d82b47eaad865379fa9c381c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9469012a76194346b83c438ee64b620e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"874e1e2f9f5341a4bed55ea0ae4b8cc1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"038876ce0b0d456ab5c4f30f8f498dbe": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5102475328e84e15b3d178dc599cc93a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"a2a210635b0a479d837de6312410032c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5162637d8a9d4fc0824c3cd6dd19cf83": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"I am planning to work on **SST-2 (Stanford Sentiment Treebank)** dataset.
\n",
"https://nlp.stanford.edu/sentiment/index.html
\n",
"https://paperswithcode.com/dataset/sst
\n",
"https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary"
],
"metadata": {
"id": "dZXodUszVbwl"
}
},
{
"cell_type": "markdown",
"source": [
"In this dataset each phrase is labelled as either negative or positive. There is a SST-5 dataset as well in which each phrase is labelled as negative, somewhat negative, neutral, somewhat positive or positive. "
],
"metadata": {
"id": "2XpoSGi03q3v"
}
},
{
"cell_type": "code",
"source": [
"!pip install datasets\n",
"!pip install transformers"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "99CDEisB8zlD",
"outputId": "139321a9-de14-4002-9463-700783940f6c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: datasets in /usr/local/lib/python3.7/dist-packages (2.5.2)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.1)\n",
"Requirement already satisfied: dill<0.3.6 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.5.1)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (5.0.0)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets) (3.0.0)\n",
"Requirement already satisfied: huggingface-hub<1.0.0,>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.10.0)\n",
"Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.18.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets) (3.8.3)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n",
"Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n",
"Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (2022.8.2)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n",
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.13)\n",
"Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.1.1)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.1.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (22.1.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.8.1)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (4.0.2)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.2.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (6.0.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (1.3.1)\n",
"Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (0.13.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (3.8.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (6.0)\n",
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.9)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.25.11)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2022.9.24)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.1)\n",
"Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.4)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.22.2)\n",
"Requirement already satisfied: tokenizers!=0.11.3,<0.13,>=0.11.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.12.1)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.21.6)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (5.0.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (6.0)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2022.6.2)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.0)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.64.1)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.9.0->transformers) (4.1.1)\n",
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.9)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.8.1)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.25.11)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2022.9.24)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from datasets import load_dataset\n",
"from transformers import DistilBertTokenizerFast, TFAutoModelForSequenceClassification"
],
"metadata": {
"id": "9Y_5TS7i8bdN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"There are many ways to load the dataset, for example using tensorflow_datasets (https://www.tensorflow.org/datasets/api_docs/python/tfds/load), but I am planning to use datasets package"
],
"metadata": {
"id": "mVILRmQeU7zH"
}
},
{
"cell_type": "code",
"source": [
"data_sst2 = load_dataset(\"glue\", \"sst2\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87,
"referenced_widgets": [
"fbbc510b0e1548748409b6c016bdaa24",
"5b95f4951d0b44bb99d2b76f54556511",
"031b30d2c7d7402f8c63dd11c83bc690",
"38b1d5d5328249e2b830e9847cf407e1",
"30e4917df33c4f4fbd7813ab864f6b76",
"b470b1d8fb274a6fb933ecab0c18fa12",
"6eba305524bb4ea08d36eff08f34fa75",
"57519ea679234aa18d9cb7e44b8181a9",
"ae126db234f34079a2bf79dbabd624a3",
"3fce668226424a70bd0afcc725d68739",
"b810c87d8aec4c96acb20f4563777ddc"
]
},
"id": "zA2dObmZJYNc",
"outputId": "7beb9f22-344a-4319-be19-3cd905672da1"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"WARNING:datasets.builder:Found cached dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/3 [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "fbbc510b0e1548748409b6c016bdaa24"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"data_sst2"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "opKFVLJeJqlM",
"outputId": "85fa6285-d54a-4f66-a265-f70e41d95117"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['sentence', 'label', 'idx'],\n",
" num_rows: 67349\n",
" })\n",
" validation: Dataset({\n",
" features: ['sentence', 'label', 'idx'],\n",
" num_rows: 872\n",
" })\n",
" test: Dataset({\n",
" features: ['sentence', 'label', 'idx'],\n",
" num_rows: 1821\n",
" })\n",
"})"
]
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"source": [
"data_sst2['train'][0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wgODW_Y9JusF",
"outputId": "fc9e22a0-0a10-468b-a33e-a90629669723"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'sentence': 'hide new secretions from the parental units ',\n",
" 'label': 0,\n",
" 'idx': 0}"
]
},
"metadata": {},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"source": [
"data_sst2['train'].features"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oA0i0XHoSYlN",
"outputId": "4fa68594-f67c-43ee-dbdc-35de92056176"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'sentence': Value(dtype='string', id=None),\n",
" 'label': ClassLabel(num_classes=2, names=['negative', 'positive'], id=None),\n",
" 'idx': Value(dtype='int32', id=None)}"
]
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "markdown",
"source": [
"Refrence: https://huggingface.co/docs/transformers/index, \n",
"https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb\n",
"\n",
"we need to preprocess our text. Tokenization and preprocessing is generally based on the model architecture you use. \n",
"\n",
"Let's use pretrained distilbert. We can use huggingface transformer library. We can also train our tokenizer from scratch.\n",
"\n",
"we can use AutoTokenizer.from_pretrained as well instead of below function"
],
"metadata": {
"id": "74nHWqmVMdrj"
}
},
{
"cell_type": "code",
"source": [
"tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')"
],
"metadata": {
"id": "nAIQlwoaLaVn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def preprocess(data):\n",
" return tokenizer(data['sentence'], truncation=True)"
],
"metadata": {
"id": "8yFKkFimRe92"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"we can use the map method of our dataset object to apply above function on all datapoints of all splits.\n",
"\n",
"Note that we passed batched=True to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently."
],
"metadata": {
"id": "xCorwRwbS_y_"
}
},
{
"cell_type": "code",
"source": [
"dataset_enc = data_sst2.map(preprocess, batched=True)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 106,
"referenced_widgets": [
"2691abf9765e4cf18a142cfc10ee5d2f",
"2d8281fc5bc6411a898e4c31687462d9",
"831f795de5d54f32bbd821bcafb16675",
"fc06fd3b96124f498f041cff507fe00a",
"5bc4c971d82b47eaad865379fa9c381c",
"9469012a76194346b83c438ee64b620e",
"874e1e2f9f5341a4bed55ea0ae4b8cc1",
"038876ce0b0d456ab5c4f30f8f498dbe",
"5102475328e84e15b3d178dc599cc93a",
"a2a210635b0a479d837de6312410032c",
"5162637d8a9d4fc0824c3cd6dd19cf83"
]
},
"id": "CniAzqtmRvV6",
"outputId": "22a98839-c74e-4697-f87a-0221b976b807"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/68 [00:00, ?ba/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "2691abf9765e4cf18a142cfc10ee5d2f"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7ab0de81d13d7bdc.arrow\n",
"WARNING:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b7cf8f8990ed6f2f.arrow\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"dataset_enc[\"train\"].features"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9ARVOUICRNh2",
"outputId": "b14ae1a4-7821-4f89-f429-d19aa6e30b08"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'sentence': Value(dtype='string', id=None),\n",
" 'label': ClassLabel(num_classes=2, names=['negative', 'positive'], id=None),\n",
" 'idx': Value(dtype='int32', id=None),\n",
" 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),\n",
" 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}"
]
},
"metadata": {},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"source": [
"dataset_enc[\"train\"][0]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nAb1QG1QgD7j",
"outputId": "e7811016-8db1-44a9-f446-3eb49fbbc3c2"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'sentence': 'hide new secretions from the parental units ',\n",
" 'label': 0,\n",
" 'idx': 0,\n",
" 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],\n",
" 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
]
},
"metadata": {},
"execution_count": 27
}
]
},
{
"cell_type": "code",
"source": [
"dataset_enc[\"train\"].features[\"label\"]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bkSac7FpTWZb",
"outputId": "905d1a9f-3eec-4940-8702-f9578a6b3256"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ClassLabel(num_classes=2, names=['negative', 'positive'], id=None)"
]
},
"metadata": {},
"execution_count": 28
}
]
},
{
"cell_type": "markdown",
"source": [
"Convert datasets to tf.data.Dataset, so that Keras can understand it."
],
"metadata": {
"id": "1gcN67-XObot"
}
},
{
"cell_type": "code",
"source": [
"model = TFAutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3NAAESdaUBNX",
"outputId": "27be4f23-01a2-423b-a79a-5c7fb7882278"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'vocab_layer_norm', 'activation_13']\n",
"- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_39', 'pre_classifier']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"tf_train_dataset = model.prepare_tf_dataset(\n",
" dataset_enc[\"train\"],\n",
" shuffle=True,\n",
" batch_size=64,\n",
" tokenizer=tokenizer\n",
")\n",
"\n",
"tf_validation_dataset = model.prepare_tf_dataset(\n",
" dataset_enc[\"validation\"],\n",
" shuffle=False,\n",
" batch_size=64,\n",
" tokenizer=tokenizer,\n",
")\n",
"\n",
"tf_validation_test = model.prepare_tf_dataset(\n",
" dataset_enc[\"test\"],\n",
" shuffle=False,\n",
" batch_size=64,\n",
" tokenizer=tokenizer,\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9oW2wFSUOAaF",
"outputId": "6dedf3f0-5838-4968-a6a6-c6bb25a62a9d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# WE can use tf_train_dataset and tf_validation_dataset in model.fit"
],
"metadata": {
"id": "VTQQqzGgUp7R"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "AHKEdT-KU1yt"
},
"execution_count": null,
"outputs": []
}
]
}