{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Pr7vJKcKd4gr", "outputId": "0f6cc3c7-24d7-49d0-dc9d-26bc45a64cdd" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.0+cu118)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.4)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n", "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.11.3)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.2.0)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.3.post1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n" ] } ], "source": [ "!pip install transformers torch scikit-learn pandas" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tH2zIdlZg5cF", "outputId": "3490654d-09c5-4270-966d-6e5aba363443" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6L-RdDLGUDNv" }, "outputs": [], "source": [ "import os\n", "import torch\n", "from torch import nn\n", "from torch.utils.data import DataLoader, Dataset\n", "from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RlR2_pMvU5iF" }, "outputs": [], "source": [ "data_file = \"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\"\n", "df = pd.read_csv(data_file)\n", "df.dropna(subset=['category'], inplace=True)\n", "# df = df.drop(df[df['category'] == 0].index)\n", "df['category']=[int(i) for i in df['category']]\n", "df['category']=[2 if i==1 else i for i in df['category']]\n", "df['category']=[1 if i==0 else i for i in df['category']]\n", "df['category']=[0 if i==-1 else i for i in df['category']]\n", "df['clean_text']=[str(i) for i in df['clean_text']]\n", "df=df.sample(10000)\n", "texts = df['clean_text'].tolist()\n", "labels=df['category'].tolist()" ] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bGwnFQgjaAeO", "outputId": "9a416f57-06b2-4c89-e16a-9d670260bb58" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(10000, 2)" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rQOk57pXU6gS", "outputId": "e8e861fe-1fb9-4467-9718-8d497c72eff9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['mean seriously there should reality show how shameless one can become launched bjp contested bjp with narendra modi leading contestant nation will love going history elections guess all love touch drama']" ] }, "metadata": {}, "execution_count": 25 } ], "source": [ "import random\n", "random.choices(texts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oiRLdjU8U9CG", "outputId": "84e61a8b-2f0e-46e7-8650-a0f0febae12d" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[0, 0, 1, 1, 1]" ] }, "metadata": {}, "execution_count": 26 } ], "source": [ "random.sample(labels,5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pDcj8B3uVA9X" }, "outputs": [], "source": [ "class TextClassificationDataset(Dataset):\n", " def __init__(self, texts, labels, tokenizer, max_length):\n", " self.texts = texts\n", " self.labels = labels\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", " def __len__(self):\n", " return len(self.texts)\n", " def __getitem__(self, idx):\n", " text = self.texts[idx]\n", " label = self.labels[idx]\n", " encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)\n", " return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Uz3IzNx6VSoX" }, "outputs": [], "source": [ "class BERTClassifier(nn.Module):\n", " def __init__(self, bert_model_name, num_classes):\n", " super(BERTClassifier, self).__init__()\n", " self.bert = BertModel.from_pretrained(bert_model_name)\n", " self.dropout = nn.Dropout(0.1)\n", " self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)\n", "\n", " def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", " pooled_output = outputs.pooler_output\n", " x = self.dropout(pooled_output)\n", " logits = self.fc(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JoI_Ya8RVVBO" }, "outputs": [], "source": [ "def train(model, data_loader, optimizer, scheduler, device):\n", " model.train()\n", " for batch in data_loader:\n", " optimizer.zero_grad()\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", " loss = nn.CrossEntropyLoss()(outputs, labels)\n", " loss.backward()\n", " optimizer.step()\n", " scheduler.step()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "E9nHv8IsVXJw" }, "outputs": [], "source": [ "def evaluate(model, data_loader, device):\n", " model.eval()\n", " predictions = []\n", " actual_labels = []\n", " with torch.no_grad():\n", " for batch in data_loader:\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", " _, preds = torch.max(outputs, dim=1)\n", " predictions.extend(preds.cpu().tolist())\n", " actual_labels.extend(labels.cpu().tolist())\n", " return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kbrYHrYtVX5o" }, "outputs": [], "source": [ "def predict_sentiments(texts, model, tokenizer, device, max_length=128):\n", " model.eval()\n", " all_predictions = []\n", "\n", " with torch.no_grad():\n", " for text in texts:\n", " encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)\n", " input_ids = encoding['input_ids'].to(device)\n", " attention_mask = encoding['attention_mask'].to(device)\n", "\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", " _, preds = torch.max(outputs, dim=1) # Assuming logits is the output of your model\n", " label = preds.item()\n", "\n", " all_predictions.append(label)\n", "\n", " positive_percentage = (sum(1 for label in all_predictions if label == 2) / len(all_predictions)) * 100\n", " neutral_percentage = (sum(1 for label in all_predictions if label == 1) / len(all_predictions)) * 100\n", " negative_percentage = (sum(1 for label in all_predictions if label == 0) / len(all_predictions)) * 100\n", "\n", " return positive_percentage, neutral_percentage, negative_percentage" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "in1DCI9BVaEJ" }, "outputs": [], "source": [ "# Set up parameters\n", "bert_model_name = 'bert-base-uncased'\n", "num_classes = 3\n", "max_length = 128\n", "batch_size = 16\n", "num_epochs = 1\n", "learning_rate = 2e-5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wAPXY-B5VdUQ" }, "outputs": [], "source": [ "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qNrconhaVjLw", "outputId": "12d7a846-542b-42dc-8f7e-3bfae9c2a66c" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[0, 2, 1, 1, 0, 2, 2, 1, 0]" ] }, "metadata": {}, "execution_count": 34 } ], "source": [ "random.sample(train_labels,9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1mxmBNGbVf8L" }, "outputs": [], "source": [ "# Specify a cache directory for the tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name, cache_dir=\"/path/to/cache/directory\")\n", "\n", "# Rest of your code remains the same\n", "train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "EM8AiMHnVnlh" }, "outputs": [], "source": [ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model = BERTClassifier(bert_model_name, num_classes).to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XHukmDqSVo0c", "outputId": "cf8ca79c-9e3a-44c2-ca87-9be30b1ca42c" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] } ], "source": [ "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oDz6E1voVrlZ", "outputId": "16c3d9b0-1982-482f-9739-c2b81b95e420" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/1\n", "Validation Accuracy: 0.7180\n", " precision recall f1-score support\n", "\n", " 0 0.61 0.55 0.58 465\n", " 1 0.74 0.74 0.74 650\n", " 2 0.76 0.78 0.77 885\n", "\n", " accuracy 0.72 2000\n", " macro avg 0.70 0.69 0.70 2000\n", "weighted avg 0.72 0.72 0.72 2000\n", "\n" ] } ], "source": [ "for epoch in range(num_epochs):\n", " print(f\"Epoch {epoch + 1}/{num_epochs}\")\n", " train(model, train_dataloader, optimizer, scheduler, device)\n", " accuracy, report = evaluate(model, val_dataloader, device)\n", " print(f\"Validation Accuracy: {accuracy:.4f}\")\n", " print(report)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "u78uMbu2VslK" }, "outputs": [], "source": [ "torch.save(model.state_dict(), \"bert_classifier_three_labeled.pth\")" ] }, { "cell_type": "code", "source": [ "\n" ], "metadata": { "id": "YQK0UZRdElRa" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uXZNj4pMVus0", "outputId": "91bbb8fc-b29f-4bb3-ec83-686565deb849" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Positive Percentage: 40.00%\n", "Neutral Percentage: 60.00%\n", "Negative Percentage: 0.00%\n" ] } ], "source": [ "test_texts = [\n", " \"PM Modi's unwavering dedication to economic development and his efforts to uplift the marginalized communities are truly commendable.\",\n", " \"I'm not sure how I feel about this.\",\n", " \"This is a negative statement about the situation.\",\n", " \"Feeling positive about the upcoming event!\",\n", " \"Neutral statement to test the model.\"\n", "]\n", "\n", "positive_percent, neutral_percent, negative_percent = predict_sentiments(test_texts, model, tokenizer, device)\n", "print(f\"Positive Percentage: {positive_percent:.2f}%\")\n", "print(f\"Neutral Percentage: {neutral_percent:.2f}%\")\n", "print(f\"Negative Percentage: {negative_percent:.2f}%\")" ] }, { "cell_type": "code", "source": [ "import joblib\n", "from transformers import BertForSequenceClassification, BertTokenizer\n", "\n", "# Example: Load or initialize your BERT model\n", "model = BertForSequenceClassification.from_pretrained('bert-base-uncased')\n", "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", "\n", "# Example: Train your BERT model (replace this with your actual training code)\n", "\n", "# Save the model and tokenizer using joblib\n", "joblib.dump(model, 'bert_classifier_model.pkl')\n", "joblib.dump(tokenizer, 'bert_classifier_tokenizer.pkl')\n" ], "metadata": { "id": "Dene5QabGch6", "outputId": "19708822-836f-46fa-8976-137274b33217", "colab": { "base_uri": "https://localhost:8080/", "height": 185, "referenced_widgets": [ "d9a25586dd834e42b99b5c274cd0935d", "0303b03e9cab4a8b9ba2dddd87a4facd", "e0b75fa139a64528ae9bc55fc06bc5ea", "8cf0ca3cf5bd4b60b7a04f4118159fe2", "cf9746188523473fa0dc39301e282340", "d6906e77d9b14d278784e9cd9c3a6cf6", "d043e4555bb849fc9f3c92da52603eda", "563365543d6e4bbc9c28bf5bb4133790", "f61ad0ff939d415e93e3d39e3e0097fb", "0c7866fccea940f4b315d39520ca474f", "893498ae4ec648b394d3784795432def", "f671208d412746fe80ea07026ee3bc00", "c95faa4e2a0343dc82194553d77bcd58", "b74261eb3702403b854c587ea5911677", "03260f0e829049d3bb5e56d4dbe46ad9", "453838b6f94248bcba56a6f6c7000a2c", "c160773d7d664ff4957cadefe7817980", "92c2deca172940418a0451ef81b753e3", "d7699394ab574c6d9375a413962dbbc6", "23b9ac612d294d91b100b2520c824d9d", "3614df9bd02548f09d9da169693a0373", "b022cc0f17c54c5aaf8e9e0722a627b3", "571644d3b69e416295335a4e0d71c846", "9dbf2ecb4e41422d8e8647d55bfcaeb7", "704509f29b4440b2a26f89810cbd0f93", "7c9e12515b8c48578abe0870b99aa6c4", "1d77c5ff1fb0427bbfa4d74d630b3440", "8c27fdca2d954ac0b3bff48c4a21c9f4", "e1af2c93d89c42b6af4f71c6221edaa2", "50a2bc1e58b34de39493fcc99d7906d8", "bc7ceea7d04e4511af7c923b4380702e", "2c248b5e372e4eeca557dcfbca329c2f", "d563e6343b564accb9c972bf6a573d74" ] } }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "tokenizer_config.json: 0%| | 0.00/28.0 [00:00