import pandas as pd
import os
import torch
import numpy as np # NEW: Needed for math operations on arrays
from torch import nn # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
def train_mindguard_model():
print("🚀 Initializing MindGuard Training Pipeline...")
# --- BULLETPROOF PATHING ---
# 1. Find exactly where this train.py script lives
script_dir = os.path.dirname(os.path.abspath(__file__))
# 2. Go up two folders (from src/core_model) to find the project root
project_root = os.path.abspath(os.path.join(script_dir, "../../"))
# 3. Define the exact absolute paths
data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")
# Ensure artifacts directory exists
os.makedirs(artifacts_dir, exist_ok=True)
print(f"Loading data from {data_path}...")
df = pd.read_csv(data_path, on_bad_lines='skip')
# --- THE FIX: Pandas parsing safety ---
df = df.dropna(subset=['text', 'label'])
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(str)
# --- THE FIX: Data Sanitizer ---
# 1. Drop any rows where the label is just a number (e.g., "0" or "1")
df = df[~df['label'].str.isnumeric()]
# 2. Drop the corrupted 'admi' label
df = df[df['label'] != 'admi']
# ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want to do a fast 2-minute test run!
# df = df.sample(500, random_state=42)
# 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
num_labels = len(label_encoder.classes_)
# Save the label mapping
mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
print(f"Detected {num_labels} unique emotions: {mapping}")

    # 3. Split the data into Training (80%) and Testing (20%)
    # stratify keeps the emotion mix identical in both splits, so every class
    # shows up in training and the class-weight tensor matches num_labels
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])

    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])

    # Automatically detect if a GPU is available locally, otherwise use CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)
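    # Optional visibility sketch (an addition, not in the original): print each
    # class's penalty weight; rarer emotions should get noticeably larger weights.
    for cls_id, w in zip(unique_classes, weights):
        print(f"   '{label_encoder.classes_[cls_id]}' -> weight {w:.2f}")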

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    # Function to convert text into numbers
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename the label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")

    # --- THE FIX: Strip out the raw text so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])

    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")
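
    # Optional sanity-check sketch (an addition, not in the original): peek at the
    # columns that will be fed to the model; expect tensors such as input_ids,
    # attention_mask, and labels.
    print(f"Sample training features: {list(tokenized_train[0].keys())}")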

    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)

    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro')
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}

    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,            # Uses the absolute path
        eval_strategy="epoch",               # Test the model at the end of every round
        learning_rate=3e-5,                  # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,      # How many sentences to look at once
        num_train_epochs=5,                  # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                    # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",               # Must match eval_strategy for load_best_model_at_end
        metric_for_best_model="f1_macro",    # NEW: Tell the AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True          # NEW: Automatically reload the best checkpoint at the end
        # overwrite_output_dir=True,
    )

    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    # 7. Start Training!
    trainer = ImbalancedTrainer(            # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics     # NEW: Attach the strict grader
    )

    print("🔥 Starting actual model training! (This might take a while depending on your computer)...")
    trainer.train()

    # 8. Save the final model
    final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model")
    print(f"✅ Training complete. Saving the brain to {final_model_dir}...")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
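
    # Optional sketch (an addition, not in the original): run one final evaluation
    # pass on the validation split so the run ends with a concrete accuracy /
    # macro-F1 number for the saved model.
    final_metrics = trainer.evaluate()
    print(f"📊 Final validation metrics: {final_metrics}")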
if __name__ == "__main__":
    train_mindguard_model()