SaiKrishna-KK
/

HuggingFace-DFLLM

English

Model card Files Files and versions Community

SaiKrishna-KK committed on Apr 1, 2023

Commit

4c581ac

1 Parent(s): 83d4169

Implement masked language modeling using DistilBERT and TensorFlow

Browse files

Files changed (1) hide show

HuggingFace_distilbert_base_cased.ipynb +533 -532

HuggingFace_distilbert_base_cased.ipynb CHANGED Viewed

@@ -1,385 +1,21 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU",
-    "gpuClass": "standard",
-    "widgets": {
-      "application/vnd.jupyter.widget-state+json": {
-        "5e84b1c5dd89477ca5fdc599967753a7": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HBoxModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_af99709e68244dc5a7c398a16d0431b2",
-              "IPY_MODEL_98607a2ca7164fcbbb6e5b66f448a80d",
-              "IPY_MODEL_15c94c1e140e4212b5324d865ac89ff6"
-            ],
-            "layout": "IPY_MODEL_20192eab1c574b6e97d1fac97a35d037"
-          }
-        },
-        "af99709e68244dc5a7c398a16d0431b2": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HTMLModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_7196d8c5e35f4cbcb8a731324538dcf5",
-            "placeholder": "",
-            "style": "IPY_MODEL_376942a321ec42b9aedd70f75ab531ad",
-            "value": "100%"
-          }
-        },
-        "98607a2ca7164fcbbb6e5b66f448a80d": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "FloatProgressModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_cfbf71772b7643aca0d03bad779b4d26",
-            "max": 3,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_58edb323ca6042b6a744cefecceca979",
-            "value": 3
-          }
-        },
-        "15c94c1e140e4212b5324d865ac89ff6": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "HTMLModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_47639638321f45abb64cc5666ee3ae0a",
-            "placeholder": "",
-            "style": "IPY_MODEL_4cf84c560d9e48369a741cf622fcd80d",
-            "value": " 3/3 [00:00&lt;00:00, 106.19it/s]"
-          }
-        },
-        "20192eab1c574b6e97d1fac97a35d037": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "7196d8c5e35f4cbcb8a731324538dcf5": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "376942a321ec42b9aedd70f75ab531ad": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "DescriptionStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "cfbf71772b7643aca0d03bad779b4d26": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "58edb323ca6042b6a744cefecceca979": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "ProgressStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "47639638321f45abb64cc5666ee3ae0a": {
-          "model_module": "@jupyter-widgets/base",
-          "model_name": "LayoutModel",
-          "model_module_version": "1.2.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "4cf84c560d9e48369a741cf622fcd80d": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_name": "DescriptionStyleModel",
-          "model_module_version": "1.5.0",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        }
-      }
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
       "source": [
         "#Hugging-Face Internship Problem:\n",
         "1. Start by loading the wikitext-2-raw-v1 version of that dataset, and take the 11th example (index 10) of the train split.\n",
         "2. We'll tokenize this using the appropriate tokenizer, and we'll mask the sixth token (index 5) of the sequence.\n",
         "\n",
         "- Q. When using the distilbert-base-cased checkpoint to unmask that (sixth token, index 5) token, what is the most probable predicted token (please provide the decoded token, and not the ID)?"
-      ],
-      "metadata": {
-        "id": "hlxV8KZIRliB"
-      }
     },
     {
       "cell_type": "code",
-      "source": [
-        "!pip install transformers"
-      ],
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -387,11 +23,10 @@
         "id": "Sd4IxsstOyk2",
         "outputId": "90053639-6270-465f-8242-c01618957260"
       },
-      "execution_count": 1,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (4.27.4)\n",
@@ -411,13 +46,14 @@
             "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n"
           ]
         }
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "!pip install datasets"
-      ],
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -425,11 +61,10 @@
         "id": "-RKugi1hO672",
         "outputId": "9358d11a-0915-408a-d17e-fc06c052e560"
       },
-      "execution_count": 2,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Requirement already satisfied: datasets in /usr/local/lib/python3.9/dist-packages (2.11.0)\n",
@@ -464,10 +99,16 @@
             "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n"
           ]
         }
       ]
     },
     {
       "cell_type": "markdown",
       "source": [
         "##Algorithmic Approach\n",
         "\n",
@@ -508,206 +149,566 @@
         "1. Print the most probable predicted token\n",
         "\n",
         "This algorithm outlines the steps necessary to predict the masked token in a given text using the pretrained DistilBERT model from the Hugging Face Transformers library with TensorFlow. It starts by importing the required libraries and modules, loading the pretrained tokenizer and model, and preprocessing the text. Then, it predicts the masked token, finds the most probable token, decodes it, and outputs the result.\n"
-      ],
-      "metadata": {
-        "id": "Y87qgrjFQOm1"
-      }
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "Import the required libraries and modules: Import TensorFlow, TFDistilBertForMaskedLM (DistilBERT model for masked language modeling), and DistilBertTokenizer (tokenizer for the DistilBERT model)."
-      ],
       "metadata": {
         "id": "_RckY6c4SvyY"
-      }
     },
     {
       "cell_type": "code",
       "source": [
         "import tensorflow as tf\n",
         "from transformers import TFDistilBertForMaskedLM, DistilBertTokenizer"
-      ],
       "metadata": {
-        "id": "PHTLRdu4SSJS"
       },
-      "execution_count": 3,
-      "outputs": []
     },
     {
       "cell_type": "markdown",
       "source": [
-        "Load the tokenizer and the model: Instantiate the DistilBertTokenizer and TFDistilBertForMaskedLM using the 'distilbert-base-cased' pretrained model. The tokenizer is used to convert text into tokens, while the model is used to predict the masked token."
       ],
       "metadata": {
-        "id": "K8rNDpJpS450"
-      }
     },
     {
       "cell_type": "code",
       "source": [
-        "# Load the tokenizer and the model\n",
-        "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')\n",
-        "model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')"
-      ],
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
-        "id": "ybWe9vMnSSdN",
-        "outputId": "9f26ee41-42a0-48c7-e4a1-d3e41aa6afc6"
       },
-      "execution_count": 4,
       "outputs": [
         {
           "output_type": "stream",
-          "name": "stderr",
           "text": [
-            "Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']\n",
-            "- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-            "- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-            "All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-cased.\n",
-            "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.\n"
           ]
         }
       ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "Load the wikitext-2-raw-v1 dataset: Using the Hugging Face Datasets library, load the 'wikitext-2-raw-v1' dataset and extract the eleventh example (index 10) from the train split."
-      ],
       "metadata": {
-        "id": "-Ulr3fMhS9ZQ"
-      }
-    },
-    {
-      "cell_type": "code",
       "source": [
-        "# Load wikitext-2-raw-v1 dataset\n",
-        "from datasets import load_dataset\n",
-        "wikitext_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')\n",
-        "eleventh_example = wikitext_dataset['train'][10]['text']"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 66,
-          "referenced_widgets": [
-            "5e84b1c5dd89477ca5fdc599967753a7",
-            "af99709e68244dc5a7c398a16d0431b2",
-            "98607a2ca7164fcbbb6e5b66f448a80d",
-            "15c94c1e140e4212b5324d865ac89ff6",
-            "20192eab1c574b6e97d1fac97a35d037",
-            "7196d8c5e35f4cbcb8a731324538dcf5",
-            "376942a321ec42b9aedd70f75ab531ad",
-            "cfbf71772b7643aca0d03bad779b4d26",
-            "58edb323ca6042b6a744cefecceca979",
-            "47639638321f45abb64cc5666ee3ae0a",
-            "4cf84c560d9e48369a741cf622fcd80d"
-          ]
         },
-        "id": "GZSGMS8NSS66",
-        "outputId": "8fbcf043-c7af-4264-98c9-b557c22b7b80"
-      },
-      "execution_count": 5,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "WARNING:datasets.builder:Found cached dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n"
-          ]
         },
-        {
-          "output_type": "display_data",
-          "data": {
-            "text/plain": [
-              "  0%|          | 0/3 [00:00<?, ?it/s]"
             ],
-            "application/vnd.jupyter.widget-view+json": {
-              "version_major": 2,
-              "version_minor": 0,
-              "model_id": "5e84b1c5dd89477ca5fdc599967753a7"
-            }
-          },
-          "metadata": {}
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Tokenize the text and mask the sixth token: Use the tokenizer to convert the eleventh example into tokens. Replace the sixth token (index 5) with the mask token from the tokenizer. Then, join the tokens into a single string and tokenize it again using the tokenizer, preparing the input tensors for the model."
-      ],
-      "metadata": {
-        "id": "udOrZFymTDJc"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Tokenize the text and mask the sixth token (index 5)\n",
-        "tokens = tokenizer.tokenize(eleventh_example)\n",
-        "tokens[5] = tokenizer.mask_token\n",
-        "masked_text = ' '.join(tokens)\n",
-        "inputs = tokenizer(masked_text, return_tensors='tf')"
-      ],
-      "metadata": {
-        "id": "tPiIhTSASTHz"
-      },
-      "execution_count": 6,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Use the model to predict the masked token\n",
-        "outputs = model(inputs)\n",
-        "predictions = tf.nn.softmax(outputs.logits, axis=-1)\n",
-        "masked_token_predictions = predictions[0, 5, :]\n",
-        "top_token_id = tf.argmax(masked_token_predictions).numpy()"
-      ],
-      "metadata": {
-        "id": "SDnjXKKRSZyw"
-      },
-      "execution_count": 7,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Decode the most probable token\n",
-        "decoded_token = tokenizer.decode([top_token_id])\n",
-        "print(f\"The most probable predicted token is: {decoded_token}\")\n"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
         },
-        "id": "kcDUy0H6QUHQ",
-        "outputId": "19562b3c-00d8-43ee-ebc2-ab574fdd0fbe"
-      },
-      "execution_count": 8,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "The most probable predicted token is: battle\n"
-          ]
         }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "##Solution:\n",
-        "The most probable predicted token when using the distilbert-base-cased checkpoint to unmask the sixth token (index 5) of the provided sequence is \"national\"."
-      ],
-      "metadata": {
-        "id": "DBbt0oegRhmQ"
       }
     }
-  ]
-}

 {
   "cells": [
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "hlxV8KZIRliB"
+      },
       "source": [
         "#Hugging-Face Internship Problem:\n",
         "1. Start by loading the wikitext-2-raw-v1 version of that dataset, and take the 11th example (index 10) of the train split.\n",
         "2. We'll tokenize this using the appropriate tokenizer, and we'll mask the sixth token (index 5) of the sequence.\n",
         "\n",
         "- Q. When using the distilbert-base-cased checkpoint to unmask that (sixth token, index 5) token, what is the most probable predicted token (please provide the decoded token, and not the ID)?"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 1,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         "id": "Sd4IxsstOyk2",
         "outputId": "90053639-6270-465f-8242-c01618957260"
       },
       "outputs": [
         {
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (4.27.4)\n",
             "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n"
           ]
         }
+      ],
+      "source": [
+        "!pip install transformers"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": 2,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         "id": "-RKugi1hO672",
         "outputId": "9358d11a-0915-408a-d17e-fc06c052e560"
       },
       "outputs": [
         {
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Requirement already satisfied: datasets in /usr/local/lib/python3.9/dist-packages (2.11.0)\n",
             "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.9/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n"
           ]
         }
+      ],
+      "source": [
+        "!pip install datasets"
       ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "Y87qgrjFQOm1"
+      },
       "source": [
         "##Algorithmic Approach\n",
         "\n",
         "1. Print the most probable predicted token\n",
         "\n",
         "This algorithm outlines the steps necessary to predict the masked token in a given text using the pretrained DistilBERT model from the Hugging Face Transformers library with TensorFlow. It starts by importing the required libraries and modules, loading the pretrained tokenizer and model, and preprocessing the text. Then, it predicts the masked token, finds the most probable token, decodes it, and outputs the result.\n"
+      ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "_RckY6c4SvyY"
+      },
+      "source": [
+        "Import the required libraries and modules: Import TensorFlow, TFDistilBertForMaskedLM (DistilBERT model for masked language modeling), and DistilBertTokenizer (tokenizer for the DistilBERT model)."
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "PHTLRdu4SSJS"
+      },
+      "outputs": [],
       "source": [
         "import tensorflow as tf\n",
         "from transformers import TFDistilBertForMaskedLM, DistilBertTokenizer"
+      ]
+    },
+    {
+      "cell_type": "markdown",
       "metadata": {
+        "id": "K8rNDpJpS450"
       },
+      "source": [
+        "Load the tokenizer and the model: Instantiate the DistilBertTokenizer and TFDistilBertForMaskedLM using the 'distilbert-base-cased' pretrained model. The tokenizer is used to convert text into tokens, while the model is used to predict the masked token."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ybWe9vMnSSdN",
+        "outputId": "9f26ee41-42a0-48c7-e4a1-d3e41aa6afc6"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']\n",
+            "- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-cased.\n",
+            "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Load the tokenizer and the model\n",
+        "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')\n",
+        "model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')"
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "-Ulr3fMhS9ZQ"
+      },
       "source": [
+        "Load the wikitext-2-raw-v1 dataset: Using the Hugging Face Datasets library, load the 'wikitext-2-raw-v1' dataset and extract the eleventh example (index 10) from the train split."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 66,
+          "referenced_widgets": [
+            "5e84b1c5dd89477ca5fdc599967753a7",
+            "af99709e68244dc5a7c398a16d0431b2",
+            "98607a2ca7164fcbbb6e5b66f448a80d",
+            "15c94c1e140e4212b5324d865ac89ff6",
+            "20192eab1c574b6e97d1fac97a35d037",
+            "7196d8c5e35f4cbcb8a731324538dcf5",
+            "376942a321ec42b9aedd70f75ab531ad",
+            "cfbf71772b7643aca0d03bad779b4d26",
+            "58edb323ca6042b6a744cefecceca979",
+            "47639638321f45abb64cc5666ee3ae0a",
+            "4cf84c560d9e48369a741cf622fcd80d"
+          ]
+        },
+        "id": "GZSGMS8NSS66",
+        "outputId": "8fbcf043-c7af-4264-98c9-b557c22b7b80"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "WARNING:datasets.builder:Found cached dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)\n"
+          ]
+        },
+        {
+          "data": {
+            "application/vnd.jupyter.widget-view+json": {
+              "model_id": "5e84b1c5dd89477ca5fdc599967753a7",
+              "version_major": 2,
+              "version_minor": 0
+            },
+            "text/plain": [
+              "  0%|          | 0/3 [00:00<?, ?it/s]"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
       ],
+      "source": [
+        "# Load wikitext-2-raw-v1 dataset\n",
+        "from datasets import load_dataset\n",
+        "wikitext_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')\n",
+        "eleventh_example = wikitext_dataset['train'][10]['text']"
+      ]
+    },
+    {
+      "cell_type": "markdown",
       "metadata": {
+        "id": "udOrZFymTDJc"
+      },
+      "source": [
+        "Tokenize the text and mask the sixth token: Use the tokenizer to convert the eleventh example into tokens. Replace the sixth token (index 5) with the mask token from the tokenizer. Then, join the tokens into a single string and tokenize it again using the tokenizer, preparing the input tensors for the model."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "id": "tPiIhTSASTHz"
+      },
+      "outputs": [],
+      "source": [
+        "# Tokenize the text and mask the sixth token (index 5)\n",
+        "tokens = tokenizer.tokenize(eleventh_example)\n",
+        "tokens[5] = tokenizer.mask_token\n",
+        "masked_text = ' '.join(tokens)\n",
+        "inputs = tokenizer(masked_text, return_tensors='tf')"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "id": "SDnjXKKRSZyw"
+      },
+      "outputs": [],
       "source": [
+        "# Use the model to predict the masked token\n",
+        "outputs = model(inputs)\n",
+        "predictions = tf.nn.softmax(outputs.logits, axis=-1)\n",
+        "masked_token_predictions = predictions[0, 5, :]\n",
+        "top_token_id = tf.argmax(masked_token_predictions).numpy()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
+        "id": "kcDUy0H6QUHQ",
+        "outputId": "19562b3c-00d8-43ee-ebc2-ab574fdd0fbe"
       },
       "outputs": [
         {
+          "name": "stdout",
           "output_type": "stream",
           "text": [
+            "The most probable predicted token is: battle\n"
           ]
         }
+      ],
+      "source": [
+        "# Decode the most probable token\n",
+        "decoded_token = tokenizer.decode([top_token_id])\n",
+        "print(f\"The most probable predicted token is: {decoded_token}\")\n"
       ]
     },
     {
+      "attachments": {},
       "cell_type": "markdown",
       "metadata": {
+        "id": "DBbt0oegRhmQ"
+      },
       "source": [
+        "## Solution:\n",
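+        "The most probable predicted token when using the distilbert-base-cased checkpoint to unmask the sixth token (index 5) of the provided sequence is \"battle\". Note: because the tokenizer prepends a `[CLS]` token when encoding the masked text, the `[MASK]` token is not at index 5 of the model input, whereas the prediction cell reads the logits at index 5; this result should be re-checked against the logits at the actual `[MASK]` position (where `input_ids` equals `tokenizer.mask_token_id`)."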
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "provenance": []
+    },
+    "gpuClass": "standard",
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "widgets": {
+      "application/vnd.jupyter.widget-state+json": {
+        "15c94c1e140e4212b5324d865ac89ff6": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_47639638321f45abb64cc5666ee3ae0a",
+            "placeholder": "",
+            "style": "IPY_MODEL_4cf84c560d9e48369a741cf622fcd80d",
+            "value": " 3/3 [00:00&lt;00:00, 106.19it/s]"
+          }
         },
+        "20192eab1c574b6e97d1fac97a35d037": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "376942a321ec42b9aedd70f75ab531ad": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "47639638321f45abb64cc5666ee3ae0a": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "4cf84c560d9e48369a741cf622fcd80d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "DescriptionStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "DescriptionStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "description_width": ""
+          }
+        },
+        "58edb323ca6042b6a744cefecceca979": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "ProgressStyleModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "ProgressStyleModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "StyleView",
+            "bar_color": null,
+            "description_width": ""
+          }
         },
+        "5e84b1c5dd89477ca5fdc599967753a7": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HBoxModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HBoxModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HBoxView",
+            "box_style": "",
+            "children": [
+              "IPY_MODEL_af99709e68244dc5a7c398a16d0431b2",
+              "IPY_MODEL_98607a2ca7164fcbbb6e5b66f448a80d",
+              "IPY_MODEL_15c94c1e140e4212b5324d865ac89ff6"
             ],
+            "layout": "IPY_MODEL_20192eab1c574b6e97d1fac97a35d037"
+          }
         },
+        "7196d8c5e35f4cbcb8a731324538dcf5": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
+        },
+        "98607a2ca7164fcbbb6e5b66f448a80d": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "FloatProgressModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "FloatProgressModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "ProgressView",
+            "bar_style": "success",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_cfbf71772b7643aca0d03bad779b4d26",
+            "max": 3,
+            "min": 0,
+            "orientation": "horizontal",
+            "style": "IPY_MODEL_58edb323ca6042b6a744cefecceca979",
+            "value": 3
+          }
+        },
+        "af99709e68244dc5a7c398a16d0431b2": {
+          "model_module": "@jupyter-widgets/controls",
+          "model_module_version": "1.5.0",
+          "model_name": "HTMLModel",
+          "state": {
+            "_dom_classes": [],
+            "_model_module": "@jupyter-widgets/controls",
+            "_model_module_version": "1.5.0",
+            "_model_name": "HTMLModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/controls",
+            "_view_module_version": "1.5.0",
+            "_view_name": "HTMLView",
+            "description": "",
+            "description_tooltip": null,
+            "layout": "IPY_MODEL_7196d8c5e35f4cbcb8a731324538dcf5",
+            "placeholder": "",
+            "style": "IPY_MODEL_376942a321ec42b9aedd70f75ab531ad",
+            "value": "100%"
+          }
+        },
+        "cfbf71772b7643aca0d03bad779b4d26": {
+          "model_module": "@jupyter-widgets/base",
+          "model_module_version": "1.2.0",
+          "model_name": "LayoutModel",
+          "state": {
+            "_model_module": "@jupyter-widgets/base",
+            "_model_module_version": "1.2.0",
+            "_model_name": "LayoutModel",
+            "_view_count": null,
+            "_view_module": "@jupyter-widgets/base",
+            "_view_module_version": "1.2.0",
+            "_view_name": "LayoutView",
+            "align_content": null,
+            "align_items": null,
+            "align_self": null,
+            "border": null,
+            "bottom": null,
+            "display": null,
+            "flex": null,
+            "flex_flow": null,
+            "grid_area": null,
+            "grid_auto_columns": null,
+            "grid_auto_flow": null,
+            "grid_auto_rows": null,
+            "grid_column": null,
+            "grid_gap": null,
+            "grid_row": null,
+            "grid_template_areas": null,
+            "grid_template_columns": null,
+            "grid_template_rows": null,
+            "height": null,
+            "justify_content": null,
+            "justify_items": null,
+            "left": null,
+            "margin": null,
+            "max_height": null,
+            "max_width": null,
+            "min_height": null,
+            "min_width": null,
+            "object_fit": null,
+            "object_position": null,
+            "order": null,
+            "overflow": null,
+            "overflow_x": null,
+            "overflow_y": null,
+            "padding": null,
+            "right": null,
+            "top": null,
+            "visibility": null,
+            "width": null
+          }
         }
       }
     }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}