from google.colab import drive
# Mount Google Drive so files under /content/drive (where the dataset CSV
# path points) become readable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
# ---------------------------------------------------------------------------
# Dataset preparation
# ---------------------------------------------------------------------------
data_file = "/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv"

# The CSV stores labels as -1 / 0 / 1, while nn.CrossEntropyLoss expects class
# indices starting at 0. Label 2 is what `predict_sentiments` counts as
# positive; 0 and 1 are presumably negative and neutral -- confirm against the
# dataset description.
LABEL_MAP = {-1: 0, 0: 1, 1: 2}
SAMPLE_SIZE = 10000
RANDOM_SEED = 42


def load_labeled_tweets(csv_path, sample_size=SAMPLE_SIZE, seed=RANDOM_SEED):
    """Read the tweet CSV and return a cleaned, label-remapped random sample.

    Rows with a missing ``category`` or a missing ``clean_text`` are dropped.
    (Previously a missing text survived and was stringified to the literal
    ``'nan'``, which then went into training as if it were a tweet.)

    Args:
        csv_path: Path of a CSV with ``clean_text`` and ``category`` columns.
        sample_size: Maximum number of rows to keep. Fewer rows are returned
            when the cleaned file is smaller, instead of raising.
        seed: ``random_state`` passed to ``DataFrame.sample`` so reruns draw
            the same rows.

    Returns:
        A DataFrame whose ``category`` column holds integer labels remapped
        through ``LABEL_MAP`` (values not in the map are left unchanged) and
        whose ``clean_text`` column holds ``str`` values.
    """
    frame = pd.read_csv(csv_path).dropna(subset=['category', 'clean_text'])
    frame = frame.assign(
        # A single lookup replaces three chained rewrites whose correctness
        # depended on the order in which they were applied.
        category=frame['category'].astype(int).map(
            lambda code: LABEL_MAP.get(code, code)
        ),
        clean_text=frame['clean_text'].astype(str),
    )
    return frame.sample(n=min(sample_size, len(frame)), random_state=seed)


# In Colab/Jupyter and when run as a script, __name__ is "__main__", so this
# executes exactly where the original cell did. The guard only keeps the Drive
# read from firing when the definitions below are imported.
if __name__ == "__main__":
    df = load_labeled_tweets(data_file)
    texts = df['clean_text'].tolist()
    labels = df['category'].tolist()
    print(df.shape)


# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification head.

    The attribute names ``bert``, ``dropout`` and ``fc`` are part of the saved
    ``state_dict`` keys, so they must not be renamed if existing checkpoints
    are to remain loadable.
    """

    def __init__(self, bert_model_name, num_classes, dropout=0.1):
        """Build the classifier.

        Args:
            bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
            dropout: Dropout probability applied to the pooled output before
                the linear head (defaults to the previously hard-coded 0.1).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor -- presumably (batch, seq_len); confirm
                against the dataset class.
            attention_mask: Mask tensor passed through to BERT unchanged.

        Returns:
            The output of the linear head applied to BERT's ``pooler_output``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.fc(self.dropout(pooled_output))


# ---------------------------------------------------------------------------
# Training / evaluation loops
# ---------------------------------------------------------------------------
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'``
    and ``'label'`` tensors. The scheduler is stepped once per batch, right
    after the optimizer.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        data_loader: Iterable of batches.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped after every batch.
        device: Device the batch tensors are moved to.

    Returns:
        The mean cross-entropy loss over the batches as a ``float``
        (``0.0`` when ``data_loader`` yields nothing). Callers that ignore the
        return value are unaffected.
    """
    model.train()
    # Built once: the loss module is stateless, so re-creating it per batch
    # was pure overhead.
    criterion = nn.CrossEntropyLoss()
    # Accumulated as a detached tensor so that no host/device sync is forced
    # on every step; it is converted to a Python float once at the end.
    running_loss = 0.0
    num_batches = 0

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.detach()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return float(running_loss) / num_batches


def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.
    """
    model.eval()
    predictions = []
    actual_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions.extend(logits.argmax(dim=1).tolist())
            # The labels are only compared on the host, so they are not
            # shipped to the device and back.
            actual_labels.extend(batch['label'].tolist())

    return (
        accuracy_score(actual_labels, predictions),
        classification_report(actual_labels, predictions),
    )


def predict_sentiments(texts, model, tokenizer, device, max_length=128):
    """Classify each text and report the share of every predicted class.

    NOTE(review): in this copy of the notebook the original cell is cut off
    right after the positive percentage. The neutral (label 1) and negative
    (label 0) percentages and the order of the returned tuple are
    reconstructed from ``LABEL_MAP`` and from how the result is unpacked and
    printed where this function is called -- confirm against the original.

    Args:
        texts: Iterable of strings, classified one at a time.
        model: Module returning logits for ``input_ids`` / ``attention_mask``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length every text is padded or truncated to.

    Returns:
        ``(positive_percentage, neutral_percentage, negative_percentage)``,
        each between 0 and 100. All three are ``0.0`` when ``texts`` is empty
        (this used to divide by zero).
    """
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(
                text,
                return_tensors='pt',
                max_length=max_length,
                padding='max_length',
                truncation=True,
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            all_predictions.append(logits.argmax(dim=1).item())

    total = len(all_predictions)
    if total == 0:
        return 0.0, 0.0, 0.0

    def percentage_of(label):
        return (all_predictions.count(label) / total) * 100

    return percentage_of(2), percentage_of(1), percentage_of(0)
(sum(1 for label in all_predictions if label "data": { "text/plain": [ "[0, 2, 1, 1, 0, 2, 2, 1, 0]" ] }, "metadata": {}, "execution_count": 34 } ], "source": [ "random.sample(train_labels,9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "1mxmBNGbVf8L" }, "outputs": [], "source": [ "# Specify a cache directory for the tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name, cache_dir=\"/path/to/cache/directory\")\n", "\n", "# Rest of your code remains the same\n", "train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "EM8AiMHnVnlh" }, "outputs": [], "source": [ "device = torch.device(\"cuda\" if torch.cuda.is_available() else Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] } ], "source": [ "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oDz6E1voVrlZ", "outputId": "16c3d9b0-1982-482f-9739-c2b81b95e420" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/1\n", "Validation Accuracy: 0.7180\n", " precision recall f1-score support\n", "\n", " 0 0.61 0.55 0.58 465\n", " 1 0.74 0.74 0.74 650\n", " 2 0.76 0.78 0.77 885\n", "\n", " accuracy 0.72 2000\n", " macro avg 0.70 0.69 0.70 2000\n", "weighted avg 0.72 0.72 0.72 2000\n", "\n" ] } ], "source": [ "for 
epoch in range(num_epochs):\n", " print(f\"Epoch {epoch + 1}/{num_epochs}\")\n", " train(model, train_dataloader, optimizer, scheduler, device)\n", " accuracy, report = evaluate(model, val_dataloader, device)\n", " print(f\"Validation Accuracy: {accuracy:.4f}\")\n", " print(report)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "u78uMbu2VslK" }, "outputs": [], "source": [ "torch.save(model.state_dict(), \"bert_classifier_three_labeled.pth\")" ] }, { "cell_type": "code", "source": [ "\n" ], "metadata": { "id": "YQK0UZRdElRa" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uXZNj4pMVus0", "outputId": "91bbb8fc-b29f-4bb3-ec83-686565deb849" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Positive Percentage: 40.00%\n", "Neutral Percentage: 60.00%\n", "Negative Percentage: 0.00%\n" ] } ], "source": [ "test_texts = [\n", " \"PM Modi's unwavering dedication to economic development and his efforts to uplift the marginalized communities are truly commendable.\",\n", " \"I'm not sure how I feel about this.\",\n", " \"This is a negative statement about the situation.\",\n", " \"Feeling positive about the upcoming event!\",\n", " \"Neutral statement to test the model.\"\n", "]\n", "\n", "positive_percent, neutral_percent, negative_percent = predict_sentiments(test_texts, model, tokenizer, device)\n", "print(f\"Positive Percentage: {positive_percent:.2f}%\")\n", "print(f\"Neutral Percentage: {neutral_percent:.2f}%\")\n", "print(f\"Negative Percentage: {negative_percent:.2f}%\")" ] }, { "cell_type": "code", "source": [ "import joblib\n", "from transformers import BertForSequenceClassification, BertTokenizer\n", "\n", "# Example: Load or initialize your BERT model\n", "model = BertForSequenceClassification.from_pretrained('bert-base-uncased')\n", "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", 
"\n", "# Example: Train your BERT 