import pandas as pd
import os
import torch
import numpy as np # NEW: Needed for math operations on arrays
from torch import nn # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
def train_mindguard_model():
print("🚀 Initializing MindGuard Training Pipeline...")
# --- BULLETPROOF PATHING ---
# 1. Find exactly where this train.py script lives
script_dir = os.path.dirname(os.path.abspath(__file__))
# 2. Go up two folders (from src/core_model) to find the project root
project_root = os.path.abspath(os.path.join(script_dir, "../../"))
# 3. Define the exact absolute paths
data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")
# Ensure artifacts directory exists
os.makedirs(artifacts_dir, exist_ok=True)
print(f"Loading data from {data_path}...")
df = pd.read_csv(data_path, on_bad_lines='skip')
# --- THE FIX: Pandas parsing safety ---
df = df.dropna(subset=['text', 'label'])
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(str)
# --- THE FIX: Data Sanitizer ---
# 1. Drop any rows where the label is just a number (e.g., "0" or "1")
df = df[~df['label'].str.isnumeric()]
# 2. Drop the corrupted 'admi' label
df = df[df['label'] != 'admi']
# ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want to do a fast 2-minute test run!
# df = df.sample(500, random_state=42)
# 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
num_labels = len(label_encoder.classes_)
# Save the label mapping
mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
print(f"Detected {num_labels} unique emotions: {mapping}")

    # 3. Split the data into Training (80%) and Testing (20%)
    # stratify keeps the emotion mix identical in both splits, so every class
    # shows up in training and the class-weight tensor matches num_labels
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_encoded'])

    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])

    # Automatically detect if a GPU is available locally, otherwise use CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)
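    # Optional visibility sketch (an addition, not in the original): print each
    # class's penalty weight; rarer emotions should get noticeably larger weights.
    for cls_id, w in zip(unique_classes, weights):
        print(f"   '{label_encoder.classes_[cls_id]}' -> weight {w:.2f}")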

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    # Function to convert text into numbers
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename the label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")

    # --- THE FIX: Strip out the raw text so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])

    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")
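
    # Optional sanity-check sketch (an addition, not in the original): peek at the
    # columns that will be fed to the model; expect tensors such as input_ids,
    # attention_mask, and labels.
    print(f"Sample training features: {list(tokenized_train[0].keys())}")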

    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)

    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro')
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}

    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,            # Uses the absolute path
        eval_strategy="epoch",               # Test the model at the end of every round
        learning_rate=3e-5,                  # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,      # How many sentences to look at once
        num_train_epochs=5,                  # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                    # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",               # Must match eval_strategy for load_best_model_at_end
        metric_for_best_model="f1_macro",    # NEW: Tell the AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True          # NEW: Automatically reload the best checkpoint at the end
        # overwrite_output_dir=True,
    )

    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    # 7. Start Training!
    trainer = ImbalancedTrainer(            # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics     # NEW: Attach the strict grader
    )

    print("🔥 Starting actual model training! (This might take a while depending on your computer)...")
    trainer.train()

    # 8. Save the final model
    final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model")
    print(f"✅ Training complete. Saving the brain to {final_model_dir}...")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
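
    # Optional sketch (an addition, not in the original): run one final evaluation
    # pass on the validation split so the run ends with a concrete accuracy /
    # macro-F1 number for the saved model.
    final_metrics = trainer.evaluate()
    print(f"📊 Final validation metrics: {final_metrics}")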
if __name__ == "__main__":
    train_mindguard_model()