{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "import pandas as pd\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the IMDB reviews and keep only the first 500 for a quick run\n",
    "reviews = pd.read_csv('../Datasets/IMDB Dataset.csv')\n",
    "reviews = reviews.head(500)[\"review\"]\n",
    "reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the SST-2 fine-tuned checkpoint: the bare \"distilbert-base-uncased\"\n",
    "# model has a randomly initialized classifier head (Transformers warns about\n",
    "# exactly this), so its sentiment predictions would be meaningless noise.\n",
    "MODEL_NAME = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
    "\n",
    "# Fall back to CPU when no GPU is available instead of crashing on .to(\"cuda\")\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)\n",
    "model.eval()  # disable dropout for deterministic inference\n",
    "\n",
    "results = []\n",
    "for review in reviews:\n",
    "    # Truncate to the model's 512-token limit; padding is unnecessary for a batch of one\n",
    "    tokenized_review = tokenizer(review, return_tensors=\"pt\", truncation=True, max_length=512).to(device)\n",
    "\n",
    "    # Forward pass without gradient tracking (inference only)\n",
    "    with torch.no_grad():\n",
    "        outputs = model(**tokenized_review)\n",
    "\n",
    "    # Convert logits to class probabilities\n",
    "    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
    "\n",
    "    # Predicted label (\"NEGATIVE\"/\"POSITIVE\" via the model's own id2label map)\n",
    "    # and the probability assigned to it\n",
    "    score = probabilities.max().item()\n",
    "    label = model.config.id2label[torch.argmax(probabilities).item()]\n",
    "\n",
    "    results.append({\"label\": label, \"score\": score})\n",
    "\n",
    "# One row per review: predicted sentiment label and its confidence\n",
    "sentiment_df = pd.DataFrame(results)\n",
    "sentiment_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "NLP_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}