{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "import pandas as pd\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the IMDB reviews and keep only the first 500 for a quick run\n",
    "reviews = pd.read_csv('../Datasets/IMDB Dataset.csv')\n",
    "reviews = reviews.head(500)[\"review\"]\n",
    "reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the SST-2 fine-tuned checkpoint: the bare \"distilbert-base-uncased\"\n",
    "# model has a randomly initialized classifier head (Transformers warns about\n",
    "# exactly this), so its sentiment predictions would be meaningless noise.\n",
    "MODEL_NAME = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "\n",
    "# Fall back to CPU when no GPU is available instead of crashing on .to(\"cuda\")\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)\n",
    "model.eval()  # disable dropout for deterministic inference\n",
    "\n",
    "results = []\n",
    "for review in reviews:\n",
    "    # Truncate to the model's 512-token limit; padding is unnecessary for a batch of one\n",
    "    tokenized_review = tokenizer(review, return_tensors=\"pt\", truncation=True, max_length=512).to(device)\n",
    "\n",
    "    # Forward pass without gradient tracking (inference only)\n",
    "    with torch.no_grad():\n",
    "        outputs = model(**tokenized_review)\n",
    "\n",
    "    # Convert logits to class probabilities\n",
    "    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "\n",
    "    # Predicted label (\"NEGATIVE\"/\"POSITIVE\" via the model's own id2label map)\n",
    "    # and the probability assigned to it\n",
    "    score = probabilities.max().item()\n",
    "    label = model.config.id2label[torch.argmax(probabilities).item()]\n",
    "\n",
    "    results.append({\"label\": label, \"score\": score})\n",
    "\n",
    "# One row per review: predicted sentiment label and its confidence\n",
    "sentiment_df = pd.DataFrame(results)\n",
    "sentiment_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "NLP_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}