{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c6866894",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7b293125",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1ddac164d1df40438dddfddf1730f471",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/2312 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
    "from datasets import Dataset, DatasetDict\n",
    "import torch\n",
    "\n",
    "# Load dataset\n",
    "data = pd.read_csv('C:/Users/Administrator/Downloads/ds_2300_Sheet1.csv')\n",
    "\n",
    "# Remove 'id' column\n",
    "data = data.drop(columns=['id'])\n",
    "\n",
    "# Adding a dummy label column (ensure it's an integer type)\n",
    "data['label'] = 0\n",
    "\n",
    "# Convert label column to integer type\n",
    "data['label'] = data['label'].astype(float)\n",
    "\n",
    "# Convert to Hugging Face dataset\n",
    "dataset = Dataset.from_pandas(data)\n",
    "\n",
    "# Loading pre-trained uncased multilingual BERT model and tokenizer\n",
    "model_name = 'bert-base-multilingual-uncased'\n",
    "tokenizer = BertTokenizer.from_pretrained(model_name)\n",
    "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)  # Adjust num_labels if needed\n",
    "\n",
    "# Tokenization function\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)  # Adjust max_length if needed\n",
    "\n",
    "# Tokenize the dataset\n",
    "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
    "\n",
    "# Split the dataset\n",
    "split_datasets = tokenized_datasets.train_test_split(test_size=0.1)\n",
    "train_dataset = split_datasets['train']\n",
    "eval_dataset = split_datasets['test']\n",
    "\n",
    "# Convert train and eval datasets to PyTorch tensors and ensure labels are Long tensors\n",
    "def format_dataset(dataset):\n",
    "    return dataset.with_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n",
    "\n",
    "train_dataset = format_dataset(train_dataset)\n",
    "eval_dataset = format_dataset(eval_dataset)\n",
    "\n",
    "# Define training arguments\n",
    "training_args = TrainingArguments(\n",
    "    output_dir='./results',\n",
    "    evaluation_strategy='epoch',\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=8,\n",
    "    per_device_eval_batch_size=8,\n",
    "    num_train_epochs=5,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "# Define Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    eval_dataset=eval_dataset,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d533c43",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='187' max='1300' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [ 187/1300 17:25 < 1:44:52, 0.18 it/s, Epoch 0.72/5]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Train the model\n",
    "trainer.train()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ca0b7d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the model\n",
    "eval_results = trainer.evaluate()\n",
    "print(\"Evaluation Results:\", eval_results)\n",
    "\n",
    "# Save the model\n",
    "model.save_pretrained('./fine-tuned-bert-urdu')\n",
    "tokenizer.save_pretrained('./fine-tuned-bert-urdu')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}