In [None]:
!pip install transformers torch scikit-learn pandas



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
data_file = "/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv"
df = pd.read_csv(data_file)
df.dropna(subset=['category'], inplace=True)
# df = df.drop(df[df['category'] == 0].index)
df['category']=[int(i) for i in df['category']]
df['category']=[2 if i==1 else i for i in df['category']]
df['category']=[1 if i==0 else i for i in df['category']]
df['category']=[0 if i==-1 else i for i in df['category']]
df['clean_text']=[str(i) for i in df['clean_text']]
df=df.sample(10000)
texts = df['clean_text'].tolist()
labels=df['category'].tolist()

In [None]:
df.shape

(10000, 2)

In [None]:
import random
random.choices(texts)

['mean seriously there should reality show how shameless one can become launched bjp contested bjp with narendra modi leading contestant nation will love going history elections guess all love touch drama']

In [None]:
random.sample(labels,5)

[0, 0, 1, 1, 1]

In [None]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}


In [None]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [None]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [None]:
def predict_sentiments(texts, model, tokenizer, device, max_length=128):
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for text in texts:
            encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)  # Assuming logits is the output of your model
            label = preds.item()

            all_predictions.append(label)

    positive_percentage = (sum(1 for label in all_predictions if label == 2) / len(all_predictions)) * 100
    neutral_percentage = (sum(1 for label in all_predictions if label == 1) / len(all_predictions)) * 100
    negative_percentage = (sum(1 for label in all_predictions if label == 0) / len(all_predictions)) * 100

    return positive_percentage, neutral_percentage, negative_percentage

In [None]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 3
max_length = 128
batch_size = 16
num_epochs = 1
learning_rate = 2e-5

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
random.sample(train_labels,9)

[0, 2, 1, 1, 0, 2, 2, 1, 0]

In [None]:
# Specify a cache directory for the tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name, cache_dir="/path/to/cache/directory")

# Rest of your code remains the same
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [None]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/1
Validation Accuracy: 0.7180
              precision    recall  f1-score   support

           0       0.61      0.55      0.58       465
           1       0.74      0.74      0.74       650
           2       0.76      0.78      0.77       885

    accuracy                           0.72      2000
   macro avg       0.70      0.69      0.70      2000
weighted avg       0.72      0.72      0.72      2000



In [None]:
torch.save(model.state_dict(), "bert_classifier_three_labeled.pth")

In [None]:
test_texts = [
    "PM Modi's unwavering dedication to economic development and his efforts to uplift the marginalized communities are truly commendable.",
    "I'm not sure how I feel about this.",
    "This is a negative statement about the situation.",
    "Feeling positive about the upcoming event!",
    "Neutral statement to test the model."
]

positive_percent, neutral_percent, negative_percent = predict_sentiments(test_texts, model, tokenizer, device)
print(f"Positive Percentage: {positive_percent:.2f}%")
print(f"Neutral Percentage: {neutral_percent:.2f}%")
print(f"Negative Percentage: {negative_percent:.2f}%")

Positive Percentage: 40.00%
Neutral Percentage: 60.00%
Negative Percentage: 0.00%


In [None]:
import joblib
from transformers import BertForSequenceClassification, BertTokenizer

# Example: Load or initialize your BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example: Train your BERT model (replace this with your actual training code)

# Save the model and tokenizer using joblib
joblib.dump(model, 'bert_classifier_model.pkl')
joblib.dump(tokenizer, 'bert_classifier_tokenizer.pkl')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['bert_classifier_tokenizer.pkl']