File size: 11,443 Bytes

5584c5d

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LlamaForSequenceClassification(\n",
       "  (model): LlamaModel(\n",
       "    (embed_tokens): Embedding(49153, 576, padding_idx=49152)\n",
       "    (layers): ModuleList(\n",
       "      (0-29): 30 x LlamaDecoderLayer(\n",
       "        (self_attn): LlamaSdpaAttention(\n",
       "          (q_proj): Linear(in_features=576, out_features=576, bias=False)\n",
       "          (k_proj): Linear(in_features=576, out_features=192, bias=False)\n",
       "          (v_proj): Linear(in_features=576, out_features=192, bias=False)\n",
       "          (o_proj): Linear(in_features=576, out_features=576, bias=False)\n",
       "          (rotary_emb): LlamaRotaryEmbedding()\n",
       "        )\n",
       "        (mlp): LlamaMLP(\n",
       "          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)\n",
       "          (up_proj): Linear(in_features=576, out_features=1536, bias=False)\n",
       "          (down_proj): Linear(in_features=1536, out_features=576, bias=False)\n",
       "          (act_fn): SiLU()\n",
       "        )\n",
       "        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)\n",
       "        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)\n",
       "      )\n",
       "    )\n",
       "    (norm): LlamaRMSNorm((576,), eps=1e-05)\n",
       "    (rotary_emb): LlamaRotaryEmbedding()\n",
       "  )\n",
       "  (score): Linear(in_features=576, out_features=2, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import GPT2Tokenizer, LlamaForSequenceClassification\n",
    "\n",
    "# Load the GPT2 tokenizer and Llama model for sequence classification\n",
    "model_path = r\"C:\\Users\\jatin\\OneDrive\\Desktop\\plagiarism-detection\\smolLM-fined-tuned-for-PLAGAIRISM-Detection\\model\"\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)\n",
    "model = LlamaForSequenceClassification.from_pretrained(model_path, local_files_only=True)\n",
    "\n",
    "# Set model to evaluation mode\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentence1</th>\n",
       "      <th>sentence2</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A person on a horse jumps over a broken down a...</td>\n",
       "      <td>A person is at a diner, ordering an omelette.</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A person on a horse jumps over a broken down a...</td>\n",
       "      <td>A person is outdoors, on a horse.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Children smiling and waving at camera</td>\n",
       "      <td>There are children present</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Children smiling and waving at camera</td>\n",
       "      <td>The kids are frowning</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A boy is jumping on skateboard in the middle o...</td>\n",
       "      <td>The boy skates down the sidewalk.</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           sentence1  \\\n",
       "0  A person on a horse jumps over a broken down a...   \n",
       "1  A person on a horse jumps over a broken down a...   \n",
       "2              Children smiling and waving at camera   \n",
       "3              Children smiling and waving at camera   \n",
       "4  A boy is jumping on skateboard in the middle o...   \n",
       "\n",
       "                                       sentence2  label  \n",
       "0  A person is at a diner, ordering an omelette.      0  \n",
       "1              A person is outdoors, on a horse.      1  \n",
       "2                     There are children present      1  \n",
       "3                          The kids are frowning      0  \n",
       "4              The boy skates down the sidewalk.      0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"train_snli.txt\", delimiter='\\t', header=None, names=['sentence1', 'sentence2', 'label'])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import Dataset, DataLoader\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "class PlagiarismDataset(Dataset):\n",
    "    def __init__(self, df, tokenizer, max_length=128):\n",
    "        self.df = df\n",
    "        self.tokenizer = tokenizer\n",
    "        self.max_length = max_length\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.df)\n",
    "\n",
    "    def __getitem__(self, index):\n",
    "        row = self.df.iloc[index]\n",
    "\n",
    "        # Ensure the sentences are strings; convert or skip if not\n",
    "        sentence1 = str(row['sentence1']) if not pd.isna(row['sentence1']) else \"\"\n",
    "        sentence2 = str(row['sentence2']) if not pd.isna(row['sentence2']) else \"\"\n",
    "\n",
    "        inputs = self.tokenizer(\n",
    "            sentence1, sentence2,\n",
    "            add_special_tokens=True,\n",
    "            max_length=self.max_length,\n",
    "            padding='max_length',\n",
    "            truncation=True,\n",
    "            return_tensors=\"pt\"\n",
    "        )\n",
    "\n",
    "        label = torch.tensor(row['label'], dtype=torch.long)\n",
    "\n",
    "        return {\n",
    "            'input_ids': inputs['input_ids'].squeeze(0),\n",
    "            'attention_mask': inputs['attention_mask'].squeeze(0),\n",
    "            'label': label\n",
    "        }\n",
    "\n",
    "def collate_fn(batch):\n",
    "    input_ids = torch.stack([item['input_ids'] for item in batch])\n",
    "    attention_masks = torch.stack([item['attention_mask'] for item in batch])\n",
    "    labels = torch.stack([item['label'] for item in batch])\n",
    "\n",
    "    return {\n",
    "        'input_ids': input_ids,\n",
    "        'attention_mask': attention_masks,\n",
    "        'label': labels\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "device(type='cuda')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assuming you have a separate test set or validation set (e.g., df_test)\n",
    "df_test = df[3_66_900:]\n",
    "# Add padding token if not already\n",
    "tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
    "\n",
    "# Resize the model's token embeddings to fit the new tokenizer\n",
    "model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "# Create DataLoader for the test set\n",
    "test_dataset = PlagiarismDataset(df_test, tokenizer)\n",
    "test_data_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Classification Report:\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      1.00      1.00       236\n",
      "           1       1.00      1.00      1.00       237\n",
      "\n",
      "    accuracy                           1.00       473\n",
      "   macro avg       1.00      1.00      1.00       473\n",
      "weighted avg       1.00      1.00      1.00       473\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "# Function to evaluate model on the test set\n",
    "# Set up device\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# Move model to the appropriate device\n",
    "model = model.to(device)\n",
    "\n",
    "# Function to evaluate the model\n",
    "def evaluate_model(model, data_loader):\n",
    "    model.eval()  # Set model to evaluation mode\n",
    "    preds_list = []\n",
    "    labels_list = []\n",
    "\n",
    "    with torch.no_grad():  # Disable gradient calculation for evaluation\n",
    "        for batch in data_loader:\n",
    "            # Move input tensors to the same device as the model\n",
    "            input_ids = batch['input_ids'].to(device)\n",
    "            attention_mask = batch['attention_mask'].to(device)\n",
    "            labels = batch['label'].to(device)\n",
    "            \n",
    "            # Get model outputs\n",
    "            outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n",
    "            preds = torch.argmax(outputs.logits, dim=1)\n",
    "\n",
    "            # Append predictions and true labels to respective lists\n",
    "            preds_list.extend(preds.cpu().numpy())\n",
    "            labels_list.extend(labels.cpu().numpy())\n",
    "    \n",
    "    # Compute evaluation metrics\n",
    "    from sklearn.metrics import classification_report\n",
    "    report = classification_report(labels_list, preds_list)\n",
    "    print(\"Classification Report:\\n\", report)\n",
    "\n",
    "# Evaluate the model\n",
    "evaluate_model(model, test_data_loader)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "LLM",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}